Exploratory Data Analysis¶

In [1]:
pip install numpy pandas matplotlib seaborn scikit-learn nbconvert pyppeteer
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: numpy in /home/arao/.local/lib/python3.7/site-packages (1.21.6)
Requirement already satisfied: pandas in /home/arao/.local/lib/python3.7/site-packages (1.3.5)
Requirement already satisfied: matplotlib in /home/arao/.local/lib/python3.7/site-packages (3.5.3)
Requirement already satisfied: seaborn in /home/arao/.local/lib/python3.7/site-packages (0.12.2)
Requirement already satisfied: scikit-learn in /home/arao/.local/lib/python3.7/site-packages (1.0.2)
Requirement already satisfied: nbconvert in /usr/local/lib/python3.7/site-packages (7.6.0)
Requirement already satisfied: pyppeteer in /home/arao/.local/lib/python3.7/site-packages (1.0.2)
Requirement already satisfied: pytz>=2017.3 in /home/arao/.local/lib/python3.7/site-packages (from pandas) (2024.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: fonttools>=4.22.0 in /home/arao/.local/lib/python3.7/site-packages (from matplotlib) (4.38.0)
Requirement already satisfied: pyparsing>=2.2.1 in /home/arao/.local/lib/python3.7/site-packages (from matplotlib) (3.1.2)
Requirement already satisfied: kiwisolver>=1.0.1 in /home/arao/.local/lib/python3.7/site-packages (from matplotlib) (1.4.5)
Requirement already satisfied: cycler>=0.10 in /home/arao/.local/lib/python3.7/site-packages (from matplotlib) (0.11.0)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/site-packages (from matplotlib) (24.0)
Requirement already satisfied: pillow>=6.2.0 in /home/arao/.local/lib/python3.7/site-packages (from matplotlib) (9.5.0)
Requirement already satisfied: typing_extensions; python_version < "3.8" in /usr/local/lib/python3.7/site-packages (from seaborn) (4.7.1)
Requirement already satisfied: joblib>=0.11 in /home/arao/.local/lib/python3.7/site-packages (from scikit-learn) (1.3.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /home/arao/.local/lib/python3.7/site-packages (from scikit-learn) (3.1.0)
Requirement already satisfied: scipy>=1.1.0 in /home/arao/.local/lib/python3.7/site-packages (from scikit-learn) (1.7.3)
Requirement already satisfied: jinja2>=3.0 in /usr/local/lib/python3.7/site-packages (from nbconvert) (3.1.4)
Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.7/site-packages (from nbconvert) (1.5.1)
Requirement already satisfied: defusedxml in /usr/local/lib/python3.7/site-packages (from nbconvert) (0.7.1)
Requirement already satisfied: pygments>=2.4.1 in /usr/local/lib/python3.7/site-packages (from nbconvert) (2.17.2)
Requirement already satisfied: nbclient>=0.5.0 in /usr/local/lib/python3.7/site-packages (from nbconvert) (0.7.4)
Requirement already satisfied: traitlets>=5.1 in /usr/local/lib/python3.7/site-packages (from nbconvert) (5.9.0)
Requirement already satisfied: jupyterlab-pygments in /usr/local/lib/python3.7/site-packages (from nbconvert) (0.2.2)
Requirement already satisfied: mistune<4,>=2.0.3 in /usr/local/lib/python3.7/site-packages (from nbconvert) (3.0.2)
Requirement already satisfied: markupsafe>=2.0 in /usr/local/lib64/python3.7/site-packages (from nbconvert) (2.1.5)
Requirement already satisfied: jupyter-core>=4.7 in /usr/local/lib/python3.7/site-packages (from nbconvert) (4.12.0)
Requirement already satisfied: tinycss2 in /usr/local/lib/python3.7/site-packages (from nbconvert) (1.2.1)
Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.7/site-packages (from nbconvert) (4.12.3)
Requirement already satisfied: bleach!=5.0.0 in /usr/local/lib/python3.7/site-packages (from nbconvert) (6.0.0)
Requirement already satisfied: nbformat>=5.7 in /usr/local/lib/python3.7/site-packages (from nbconvert) (5.8.0)
Requirement already satisfied: importlib-metadata>=3.6; python_version < "3.10" in /usr/local/lib/python3.7/site-packages (from nbconvert) (6.7.0)
Requirement already satisfied: pyee<9.0.0,>=8.1.0 in /home/arao/.local/lib/python3.7/site-packages (from pyppeteer) (8.2.2)
Requirement already satisfied: websockets<11.0,>=10.0 in /home/arao/.local/lib/python3.7/site-packages (from pyppeteer) (10.4)
Requirement already satisfied: certifi>=2021 in /home/arao/.local/lib/python3.7/site-packages (from pyppeteer) (2024.6.2)
Requirement already satisfied: tqdm<5.0.0,>=4.42.1 in /home/arao/.local/lib/python3.7/site-packages (from pyppeteer) (4.66.4)
Requirement already satisfied: urllib3<2.0.0,>=1.25.8 in /home/arao/.local/lib/python3.7/site-packages (from pyppeteer) (1.26.19)
Requirement already satisfied: appdirs<2.0.0,>=1.4.3 in /home/arao/.local/lib/python3.7/site-packages (from pyppeteer) (1.4.4)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas) (1.16.0)
Requirement already satisfied: jupyter-client>=6.1.12 in /usr/local/lib/python3.7/site-packages (from nbclient>=0.5.0->nbconvert) (7.4.9)
Requirement already satisfied: webencodings>=0.4 in /usr/local/lib/python3.7/site-packages (from tinycss2->nbconvert) (0.5.1)
Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.7/site-packages (from beautifulsoup4->nbconvert) (2.4.1)
Requirement already satisfied: fastjsonschema in /usr/local/lib/python3.7/site-packages (from nbformat>=5.7->nbconvert) (2.20.0)
Requirement already satisfied: jsonschema>=2.6 in /usr/local/lib/python3.7/site-packages (from nbformat>=5.7->nbconvert) (4.17.3)
Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/site-packages (from importlib-metadata>=3.6; python_version < "3.10"->nbconvert) (3.15.0)
Requirement already satisfied: entrypoints in /usr/local/lib/python3.7/site-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (0.4)
Requirement already satisfied: pyzmq>=23.0 in /usr/local/lib64/python3.7/site-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (26.0.3)
Requirement already satisfied: tornado>=6.2 in /usr/local/lib64/python3.7/site-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (6.2)
Requirement already satisfied: nest-asyncio>=1.5.4 in /usr/local/lib/python3.7/site-packages (from jupyter-client>=6.1.12->nbclient>=0.5.0->nbconvert) (1.6.0)
Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.7/site-packages (from jsonschema>=2.6->nbformat>=5.7->nbconvert) (23.2.0)
Requirement already satisfied: importlib-resources>=1.4.0; python_version < "3.9" in /usr/local/lib/python3.7/site-packages (from jsonschema>=2.6->nbformat>=5.7->nbconvert) (5.12.0)
Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.7/site-packages (from jsonschema>=2.6->nbformat>=5.7->nbconvert) (0.19.3)
Requirement already satisfied: pkgutil-resolve-name>=1.3.10; python_version < "3.9" in /usr/local/lib/python3.7/site-packages (from jsonschema>=2.6->nbformat>=5.7->nbconvert) (1.3.10)
Note: you may need to restart the kernel to use updated packages.
In [2]:
import pandas as pd # dataframes
import numpy as np # convert to numeric values for inputs to model 
import matplotlib as plt # used for visualizations (graphs, heatmaps, etc)

Convert data to utf-8 encoding from ANSI¶

In [3]:
# Each dataframe is set to a file from the data folder. 

df_accounts = pd.read_csv('data/Accounts.csv', sep=',', encoding='latin-1', low_memory=False)
df_distributor = pd.read_csv('data/Distributor.csv', sep=',', encoding='latin-1', low_memory=False)
df_order_data = pd.read_csv('data/order_data.csv', sep=',', encoding='latin-1', low_memory=False)
df_part_attributes = pd.read_csv('data/PartAttributes.csv',sep=',', encoding='latin-1', low_memory=False)

# this has on_bad_lines = skip, because some rows in the dataset were creating errors when reading
df_parts = pd.read_csv('data/Parts.csv', sep='|', encoding='latin-1', low_memory=False, on_bad_lines='skip')


df_pos_data = pd.read_csv('data/pos_data.csv', sep=',', encoding='latin-1', low_memory=False)
df_price_point = pd.read_csv('data/Price_point.csv', sep=',', encoding='latin-1', low_memory=False)
df_price_point_field = pd.read_csv('data/price_point_field.csv',sep=',', encoding='latin-1', low_memory=False)
df_pricing = pd.read_csv('data/Pricing.csv', sep=',', encoding='latin-1', low_memory=False)
df_quote_item = pd.read_csv('data/QuoteItem.csv', sep=',', encoding='latin-1', low_memory=False)
df_quotes = pd.read_csv('data/Quotes.csv', sep=',', encoding='latin-1', low_memory=False)
In [4]:
print(df_accounts['OID'])
0          2799360
1          2799364
2          2799365
3          2799366
4          2799370
            ...   
1689862    2012681
1689863    2012688
1689864    2012697
1689865    2012702
1689866    2012711
Name: OID, Length: 1689867, dtype: int64

List all columns in all .csv files to find relationships¶

In [5]:
dataframes_list = [df_accounts, df_distributor, df_order_data, df_part_attributes, df_parts, df_pos_data, df_price_point, df_price_point_field, df_pricing, df_quote_item, df_quotes]
dataframes_names = ["df_accounts", "df_distributor", "df_order_data", "df_part_attributes", "df_parts", "df_post_data", "df_price_point", "df_price_point_field", "df_pricing", "df_quote_item", "df_quotes"]
In [6]:
count = 0
for df in dataframes_list:
    print(dataframes_names[count])
    print(df.columns.tolist()) 
    print("\n")
    count += 1
df_accounts
['OID', 'OBSOLETE_FLAG', 'DISPLAY_NAME', 'LOCATION', 'LEGAL_NAME', 'ACCOUNT_NUM', 'TYPE', 'STATUS', 'DISTRIBUTOR_OID', 'DISTI_NAME', 'GEOGRAPHY_OID', 'GEO_NAME', 'CUST_CATEGORY_OID', 'CUST_CATE_NAME', 'DEFAULT_BILL_TO_ADDRESS_OID', 'BILL_TO_ADDRESS1', 'DEFAULT_SHIP_TO_ADDRESS_OID', 'SHIP_TO_ADDRESS1', 'SHIPMENT_METHOD_OID', 'SHIP_MTH_NAME', 'PRICING_GROUP_TYPE', 'PRICING_REGION_OID', 'PR_REGION', 'CUSTOMER_LEVEL_OID', 'TYPE_KEY', 'PARENT_DISPLAY_NAME', 'IS_BLANKET', 'CUSTOM_FIELD1', 'CUSTOM_FIELD2', 'CUSTOM_FIELD3', 'CUSTOM_FIELD4', 'CUSTOM_FIELD5', 'CUSTOM_FIELD6', 'CUSTOM_FIELD7', 'CUSTOM_FIELD8', 'CUSTOM_FIELD9', 'CUSTOM_FIELD10']


df_distributor
['OID', ' OBSOLETE_FLAG', ' NAME', ' DIST_CATEGORY_OID', ' DIST_CATE_NAME', ' STATUS', ' CAN_DEBIT', ' IS_STOCKING_REP', ' DISTI_CORPORATE_ID_OID', ' DISTI_CORP_NAME', ' PRICING_REGION_OID', ' PR_REGION', ' CAN_SPECIAL_BUY', ' CAN_BLANKET', ' CUSTOM_FIELD1', ' CUSTOM_FIELD2', ' CUSTOM_FIELD3', ' CUSTOM_FIELD4', ' CUSTOM_FIELD5', ' CUSTOM_FIELD6', ' CUSTOM_FIELD7', ' CUSTOM_FIELD8', ' CUSTOM_FIELD9', ' CUSTOM_FIELD10']


df_order_data
['QUOTE_NUM', 'NAME', 'CUSTOMER', 'MPN', 'QUANTITY', 'SCHED_DATE', 'PRICE', 'CURRENCY_INFO_OID', 'EXCHANGE_RATE', 'ACCUMULATED_USAGE', 'QUANTITY.1', 'REQUOTE_PRICE', 'ADJ_DISTI_COST', 'ADJ_DISTI_RESALE']


df_part_attributes
['PART_OID', 'MPN', 'PART_ATTRIBUTE_OID', 'OID', 'OBSOLETE_FLAG', 'PART_ATTRIBUTE_HEADER_OID', 'OBSOLETE_FLAG.1', 'PRODUCT_FAMILY_OID', 'FAMILY', 'NAME', 'SELECTOR_VALUE', 'ATTRIBUTE_VALUE', 'DESCRIPTION']


df_parts
['OID,OBSOLETE_FLAG,MPN,SMARTPART,DESCRIPTION,PRODUCT_FAMILY_OID,PROD_FAM_NAME,MIN,MULT,LEADTIME,LEADTIME_UNITS,INVENTORY,INVENTORY_DATE,SOLE_SOURCE,NCNR,PART_CLASS,PRICE_CATEGORY_OID,PRICE_CATE_NAME,PRODUCT_DIVISION,SUB_FAMILY_OID,SUB_FAMILY_NAME,ROOT,IS_ROOT,TYPE,CUSTOM_FIELD1,CUSTOM_FIELD2,CUSTOM_FIELD3,CUSTOM_FIELD4,CUSTOM_FIELD5,CUSTOM_FIELD6,CUSTOM_FIELD7,CUSTOM_FIELD8,CUSTOM_FIELD9,CUSTOM_FIELD10']


df_post_data
['POS#', 'CR_FLAG', 'MPN', 'DISTRIBUTOR', 'CUSTOMER', 'SHIP_QTY', 'SHIP_DATE', 'DEBIT_NUM', 'QUANTITY_REQ', 'QUANTITY_SHP', 'ACCEPTED_DISTI_COST', 'ACCEPTED_DISTI_RESALE', 'EFFECTIVE_DBC', 'OID', 'QUOTE_OID', 'QUANTITY', 'ACCUMULATED_USAGE']


df_price_point
['OID', 'OBSOLETE_FLAG', 'PART_OID', 'MPN', 'PRICE', 'PRICE_TABLE_DEFINITION_OID', 'NAME', 'PRICE_POINT_FIELD_OID', 'LONG_DISPLAY']


df_price_point_field
['OID', 'FIELD_NUM', 'LONG_DISPLAY', 'SHORT_DISPLAY']


df_pricing
['OID', 'PRICE', 'PRICE_TABLE_DEFINITION_OID', 'NAME']


df_quote_item
['OID', 'CREATED_DATE', 'MODIFIED_DATE', 'GUI_MODIFIED_DATE', 'OBSOLETE_FLAG', 'QUOTE_OID', 'QUOTE_NUM', 'ITEM_NUM', 'PART_OID', 'MPN', 'QUANTITY_REQUESTED', 'QUANTITY', 'START_DATE', 'OEM_PRICE', 'DISTI_COST', 'DISTI_RESALE', 'ADDERS_COST', 'TARGET_PRICE', 'COMPETITOR_OID', 'COMP_PART_OID', 'QUOTE_JUSTIFICATION', 'REQUOTE_PRICE', 'ADJ_DISTI_COST', 'ADJ_DISTI_RESALE', 'ORDER_ITEM_OID', 'DESIGN_PART_MAPPING_OID', 'STATUS', 'REF_PRICE', 'MINIMUM_PRICE', 'FIELD_MIN', 'DISTI_BOOK_COST', 'MANUFACTURER_COST', 'MPP_OID', 'STATUS.1', 'MPP_ITEM_OID', 'STATUS.2', 'EXPIRATION_DATE', 'RECOMMENDED_PRICE', 'CONTRACT_ITEM_OID', 'DEBIT_EXPIRATION_DATE', 'MPP_PRICE', 'CURRENCY_INFO_OID', 'CODE', 'EXCHANGE_RATE', 'SEGMENT_OID', 'DBC_EXCHANGE_RATE', 'PRICING_DESIGN_REG_OID', 'REG_NUM', 'BUSINESS_RULE_OID', 'STATUS.3', 'PRICING_CONVERSION_OID', 'STATUS.4', 'END_CUSTOMER_OID', 'DISPLAY_NAME', 'PROGRAM_OID', 'NAME', 'ASSEMBLY_OID', 'NAME.1', 'AUTO_REQUOTE_PRICE', 'AUTO_ADJ_DISTI_COST', 'AUTO_ADJ_DISTI_RESALE', 'IS_NO_BID', 'TYPE1_PRICING_CONVERSION_OID', 'OEM_PRICE_DEF_OID', 'NAME.2', 'DBC_PRICE_DEF_OID', 'NAME.3', 'IS_SPECIAL_BUY', 'WORKFLOW_STATUS', 'RECOMMENDED_COST', 'IS_BUDGETARY', 'LAST_APPROVED_DATE', 'RECOMMENDED_RESALE', 'ACCUMULATED_USAGE', 'SUB_ITEM_NUM', 'PART_TYPE', 'DEBIT_START_DATE', 'THRESHOLD_PRICE', 'REBATE_AMT', 'DISTI_REBATE_AMT', 'POCKET_PRICE', 'POCKET_COST_PRICE', 'RECOMMENDED_POCKET_PRICE', 'RECOMM_BUNDLE_POCKET_COST', 'DISTI_POCKET_PRICE', 'THRESHOLD_OVERRIDE', 'IS_UNLIMITED_QUANTITY', 'DEAL_SCORE', 'IS_MATRIX', 'QUANTITY_RECOMMENDED', 'DEAL_AUTH_PRICE', 'DEAL_AUTH_QUANTITY', 'DEAL_AUTH_BUSINESS_RULE_OID', 'STATUS.5', 'RECOMMENDED_REBATE_AMT', 'HIST_PRICE', 'HIST_DISTI_BOOK_COST', 'HIST_DISTI_COST', 'HIST_DISTI_RESALE', 'HIST_MANUFACTURER_COST', 'HIST_CONTRACT_ITEM_OID', 'HIST_MPP_ITEM_OID', 'HIST_QUOTE_ITEM_OID', 'CUSTOM_FIELD1', 'CUSTOM_FIELD2', 'CUSTOM_FIELD3', 'CUSTOM_FIELD4', 'CUSTOM_FIELD5', 'CUSTOM_FIELD6', 'CUSTOM_FIELD7', 'CUSTOM_FIELD8', 'CUSTOM_FIELD9', 'CUSTOM_FIELD10']


df_quotes
['OID', 'OBSOLETE_FLAG', 'CUSTOMER_OID', 'DISPLAY_NAME', 'SALES_CHANNEL', 'DISTRIBUTOR_OID', 'NAME', 'QUOTE_STATUS_OID', 'PREP_COMPLETE', 'IS_FORWARD', 'IS_CONTRACT', 'QUOTE_NUM', 'PRICING_GROUP_TYPE', 'IS_BLANKET', 'VOIDED_DATE', 'CUSTOM_FIELD1', 'CUSTOM_FIELD2', 'CUSTOM_FIELD3', 'CUSTOM_FIELD4', 'CUSTOM_FIELD5', 'CUSTOM_FIELD6', 'CUSTOM_FIELD7', 'CUSTOM_FIELD8', 'CUSTOM_FIELD9', 'CUSTOM_FIELD10']


Combining Datasets¶

  • see quantity relative to MPN in GEO area (mising data)

data availability problems with this¶

In [7]:
# merge quotes OID with MPN and PRICE, and then location to see how it's priced. then sort by location. 
#df_merge_quotedata = pd.merge(df_quotes, df_post_data, on=['OID'], suffixes=('_quotes', '_postdata'))
df_merge_quotedata = pd.merge(df_quotes, df_price_point, on=['OID'], suffixes=('_quotes2', '_pricepoint'))
df_merge_quotedata = pd.merge(df_merge_quotedata, df_accounts, on=['OID'], suffixes=('_quotedata', '_accounts'))
print(df_merge_quotedata.head())
df_merge_quotedata = df_merge_quotedata.filter(['DISPLAY_NAME', 'GEO_NAME', 'MPN', 'QUANTITY'])
# can filter broader into GEO_NAME 
print(df_merge_quotedata.head(5))

## here we do not get many values after the filtering.
     OID  OBSOLETE_FLAG_quotes2  CUSTOMER_OID  \
0  50816                      0       1192836   
1  55140                      0       2850714   

             DISPLAY_NAME_quotedata  SALES_CHANNEL  DISTRIBUTOR_OID_quotedata  \
0              vhwaqowewobxzaybbqju              1                        337   
1  IT SUPPLY AND OA COMPANY LIMITED              1                        337   

                       NAME_quotes2  QUOTE_STATUS_OID  PREP_COMPLETE  \
0  ARROW ASIA PAC LIMITED-HK-544821                 0              1   
1  ARROW ASIA PAC LIMITED-HK-544821                 0              1   

   IS_FORWARD  ...  CUSTOM_FIELD1_accounts CUSTOM_FIELD2_accounts  \
0           0  ...                     NaN                    NaN   
1           0  ...                     NaN                    NaN   

   CUSTOM_FIELD3_accounts  CUSTOM_FIELD4_accounts CUSTOM_FIELD5_accounts  \
0                     NaN                     NaN                    NaN   
1                     NaN                     NaN                    NaN   

   CUSTOM_FIELD6_accounts CUSTOM_FIELD7_accounts CUSTOM_FIELD8_accounts  \
0                     NaN                    NaN                    NaN   
1                     NaN                    NaN                    NaN   

  CUSTOM_FIELD9_accounts CUSTOM_FIELD10_accounts  
0                    NaN                     NaN  
1                    NaN                     NaN  

[2 rows x 69 columns]
  GEO_NAME             MPN
0     ASMO  AT80580PJ073JL
1     ASMO          CS4020
In [8]:
df_merged_accountsorder = pd.DataFrame()


# created a dataframe to merge the accountsa and quotes dataset based on the DISPLAY_NAME, which is the customer name. 
df_merged_accountsorder = pd.merge(df_accounts, df_quotes, on=['DISPLAY_NAME', ], suffixes=('_quoteitem', '_accounts'))

# print out the first 10 and last 10 values of the dataframe
print(df_merged_accountsorder.head(10))
print(df_merged_accountsorder.tail(10))


#print(df_merged_accountsorder.loc[0, ['OID', 'DISPLAY_NAME', 'GEO_NAME', 'MPN']])
   OID_quoteitem  OBSOLETE_FLAG_quoteitem       DISPLAY_NAME  LOCATION  \
0        1785883                        0                 AA        CA   
1        1230621                        0                 AA        CH   
2        1230633                        0                 AA        MY   
3        1230638                        0                 AA        US   
4        1992899                        0                 AA  11217275   
5        1992914                        0                 AA  11461370   
6        1992919                        0                 AA  12094695   
7        1799462                        0  American Tech Inc        FL   
8        1404172                        0  American Tech Inc        US   
9        2192259                        0  American Tech Inc  11092534   

          LEGAL_NAME ACCOUNT_NUM TYPE  STATUS  DISTRIBUTOR_OID_quoteitem  \
0                 AA         NaN  OEM  Active                          0   
1                 AA         NaN  OEM  Active                          0   
2                 AA         NaN  OEM  Active                          0   
3                 AA         NaN  OEM  Active                          0   
4                 AA         NaN  OEM  Active                          0   
5                 AA         NaN  OEM  Active                          0   
6                 AA         NaN  OEM  Active                          0   
7  American Tech Inc         NaN  REP  Active                          0   
8  American Tech Inc         NaN  REP  Active                          0   
9  American Tech Inc         NaN  REP  Active                          0   

  DISTI_NAME  ...  CUSTOM_FIELD1_accounts  \
0        NaN  ...                     NaN   
1        NaN  ...                     NaN   
2        NaN  ...                     NaN   
3        NaN  ...                     NaN   
4        NaN  ...                     NaN   
5        NaN  ...                     NaN   
6        NaN  ...                     NaN   
7        NaN  ...                     NaN   
8        NaN  ...                     NaN   
9        NaN  ...                     NaN   

                CUSTOM_FIELD2_accounts  CUSTOM_FIELD3_accounts  \
0                                  NaN                     NaN   
1                                  NaN                     NaN   
2                                  NaN                     NaN   
3                                  NaN                     NaN   
4                                  NaN                     NaN   
5                                  NaN                     NaN   
6                                  NaN                     NaN   
7  SMB (Med/Small biz <1000 employees)                     NaN   
8  SMB (Med/Small biz <1000 employees)                     NaN   
9  SMB (Med/Small biz <1000 employees)                     NaN   

  CUSTOM_FIELD4_accounts  CUSTOM_FIELD5_accounts  CUSTOM_FIELD6_accounts  \
0                    NaN                     NaN                     NaN   
1                    NaN                     NaN                     NaN   
2                    NaN                     NaN                     NaN   
3                    NaN                     NaN                     NaN   
4                    NaN                     NaN                     NaN   
5                    NaN                     NaN                     NaN   
6                    NaN                     NaN                     NaN   
7                    NaN      success@modeln.com                     NAR   
8                    NaN      success@modeln.com                     NAR   
9                    NaN      success@modeln.com                     NAR   

   CUSTOM_FIELD7_accounts CUSTOM_FIELD8_accounts  CUSTOM_FIELD9_accounts  \
0            (Select One)                     No                     NaN   
1            (Select One)                     No                     NaN   
2            (Select One)                     No                     NaN   
3            (Select One)                     No                     NaN   
4            (Select One)                     No                     NaN   
5            (Select One)                     No                     NaN   
6            (Select One)                     No                     NaN   
7              Enterprise                     No                     REP   
8              Enterprise                     No                     REP   
9              Enterprise                     No                     REP   

  CUSTOM_FIELD10_accounts  
0                 90 Days  
1                 90 Days  
2                 90 Days  
3                 90 Days  
4                 90 Days  
5                 90 Days  
6                 90 Days  
7                 90 Days  
8                 90 Days  
9                 90 Days  

[10 rows x 61 columns]
       OID_quoteitem  OBSOLETE_FLAG_quoteitem DISPLAY_NAME  LOCATION  \
45621        1936604                        0        05 RU  11929065   
45622        1936604                        0        05 RU  11929065   
45623        1936604                        0        05 RU  11929065   
45624        1936604                        0        05 RU  11929065   
45625        1936604                        0        05 RU  11929065   
45626        1936604                        0        05 RU  11929065   
45627        1936604                        0        05 RU  11929065   
45628        1936604                        0        05 RU  11929065   
45629        1936604                        0        05 RU  11929065   
45630        1936604                        0        05 RU  11929065   

      LEGAL_NAME ACCOUNT_NUM TYPE  STATUS  DISTRIBUTOR_OID_quoteitem  \
45621      05 RU         NaN  OEM  Active                          0   
45622      05 RU         NaN  OEM  Active                          0   
45623      05 RU         NaN  OEM  Active                          0   
45624      05 RU         NaN  OEM  Active                          0   
45625      05 RU         NaN  OEM  Active                          0   
45626      05 RU         NaN  OEM  Active                          0   
45627      05 RU         NaN  OEM  Active                          0   
45628      05 RU         NaN  OEM  Active                          0   
45629      05 RU         NaN  OEM  Active                          0   
45630      05 RU         NaN  OEM  Active                          0   

      DISTI_NAME  ...  CUSTOM_FIELD1_accounts  \
45621        NaN  ...                     NaN   
45622        NaN  ...                     NaN   
45623        NaN  ...                     NaN   
45624        NaN  ...                     NaN   
45625        NaN  ...                     NaN   
45626        NaN  ...                     NaN   
45627        NaN  ...                     NaN   
45628        NaN  ...                     NaN   
45629        NaN  ...                     NaN   
45630        NaN  ...                     NaN   

                 CUSTOM_FIELD2_accounts  CUSTOM_FIELD3_accounts  \
45621  Corp (Large biz >1000 employees)                     NaN   
45622  Corp (Large biz >1000 employees)                     NaN   
45623  Corp (Large biz >1000 employees)                     NaN   
45624  Corp (Large biz >1000 employees)                     NaN   
45625  Corp (Large biz >1000 employees)                     NaN   
45626  Corp (Large biz >1000 employees)                     NaN   
45627                               NaN                     NaN   
45628                               NaN                     NaN   
45629                               NaN                     NaN   
45630                               NaN                     NaN   

      CUSTOM_FIELD4_accounts          CUSTOM_FIELD5_accounts  \
45621                    NaN              success@modeln.com   
45622                    NaN              success@modeln.com   
45623                    NaN              success@modeln.com   
45624                    NaN              success@modeln.com   
45625                    NaN              success@modeln.com   
45626                    NaN              success@modeln.com   
45627                    NaN  intel-revvy-testing@modeln.com   
45628                    NaN  intel-revvy-testing@modeln.com   
45629                    NaN  intel-revvy-testing@modeln.com   
45630                    NaN  intel-revvy-testing@modeln.com   

       CUSTOM_FIELD6_accounts  CUSTOM_FIELD7_accounts CUSTOM_FIELD8_accounts  \
45621                    EU02            (Select One)                     No   
45622                    EU02            (Select One)                     No   
45623                    EU02            (Select One)                     No   
45624                    EU02            (Select One)                     No   
45625                    EU02            (Select One)                     No   
45626                    EU02            (Select One)                     No   
45627                     NaN            (Select One)                     No   
45628                     NaN            (Select One)                     No   
45629                     NaN            (Select One)                     No   
45630                     NaN            (Select One)                     No   

       CUSTOM_FIELD9_accounts CUSTOM_FIELD10_accounts  
45621                   Other                 90 Days  
45622                   Other                 90 Days  
45623                   Other                 90 Days  
45624                   Other                 90 Days  
45625                   Other                 90 Days  
45626                   Other                 90 Days  
45627                     OEM                 90 Days  
45628                     OEM                 90 Days  
45629                     OEM                 90 Days  
45630                     OEM                 90 Days  

[10 rows x 61 columns]

Orders by Location, printed out¶

In [9]:
# simple program to print out all the customers, location, and geographical names based on the Geographical Category.

df_merged_accountsorder_filter = df_merged_accountsorder.filter(['DISPLAY_NAME', 'LOCATION', 'GEO_NAME'])
#print(df_merged_accountsorder_filter)

for geo, group in df_merged_accountsorder_filter.groupby('GEO_NAME'):
    print(f"GEO CATEGORY: {geo}")
    print(group)
    print() 
GEO CATEGORY: APAC
           DISPLAY_NAME  LOCATION GEO_NAME
2                    AA        MY     APAC
5                    AA  11461370     APAC
19                JABIL        MY     APAC
20                JABIL        MY     APAC
21                JABIL        MY     APAC
...                 ...       ...      ...
45570  ARRABON COMPUTER  12669265     APAC
45571  ARRABON COMPUTER  12669265     APAC
45572  ARRABON COMPUTER  12669265     APAC
45573  ARRABON COMPUTER  12669265     APAC
45574        Supermicro  11507167     APAC

[13375 rows x 3 columns]

GEO CATEGORY: ASMO
                                   DISPLAY_NAME  LOCATION GEO_NAME
0                                            AA        CA     ASMO
3                                            AA        US     ASMO
4                                            AA  11217275     ASMO
7                             American Tech Inc        FL     ASMO
8                             American Tech Inc        US     ASMO
...                                         ...       ...      ...
44021  W & D X COMERCIO E SERVICOS DE INFORMATI        BR     ASMO
44022  W & D X COMERCIO E SERVICOS DE INFORMATI        BR     ASMO
44023  W & D X COMERCIO E SERVICOS DE INFORMATI        BR     ASMO
44024  W & D X COMERCIO E SERVICOS DE INFORMATI        BR     ASMO
44025  W & D X COMERCIO E SERVICOS DE INFORMATI        BR     ASMO

[25503 rows x 3 columns]

GEO CATEGORY: EMEA
      DISPLAY_NAME  LOCATION GEO_NAME
1               AA        CH     EMEA
6               AA  12094695     EMEA
13     JABBLA BVBA        BE     EMEA
14     JABBLA BVBA  12743427     EMEA
77         SAMSUNG  11132482     EMEA
...            ...       ...      ...
45626        05 RU  11929065     EMEA
45627        05 RU  11929065     EMEA
45628        05 RU  11929065     EMEA
45629        05 RU  11929065     EMEA
45630        05 RU  11929065     EMEA

[6592 rows x 3 columns]

GEO CATEGORY: IJKK
          DISPLAY_NAME  LOCATION GEO_NAME
6729           TOSHIBA        JP     IJKK
6736           TOSHIBA  12363568     IJKK
6737           TOSHIBA  12377756     IJKK
6755           TOSHIBA  13054878     IJKK
41607      HPC SYSTEMS  11916221     IJKK
41608      HPC SYSTEMS  11916221     IJKK
41609      HPC SYSTEMS  11916221     IJKK
41610      HPC SYSTEMS  13052151     IJKK
41611      HPC SYSTEMS  13052151     IJKK
41612      HPC SYSTEMS  13052151     IJKK
41613      HPC SYSTEMS        JP     IJKK
41614      HPC SYSTEMS        JP     IJKK
41615      HPC SYSTEMS        JP     IJKK
43766  SYSKEY CO.,LTD.  12114516     IJKK
43767  SYSKEY CO.,LTD.        JP     IJKK

GEO CATEGORY: PROC
                                DISPLAY_NAME  LOCATION GEO_NAME
15                                     JABIL        CN     PROC
16                                     JABIL        CN     PROC
17                                     JABIL        CN     PROC
18                                     JABIL        CN     PROC
31                                     JABIL  12707029     PROC
...                                      ...       ...      ...
43931  ARTESYN TECHNOLOGIES ASIA-PACIFIC LTD        CN     PROC
43932  ARTESYN TECHNOLOGIES ASIA-PACIFIC LTD        HK     PROC
43933  ARTESYN TECHNOLOGIES ASIA-PACIFIC LTD  12864592     PROC
43934  ARTESYN TECHNOLOGIES ASIA-PACIFIC LTD  12864621     PROC
43935  ARTESYN TECHNOLOGIES ASIA-PACIFIC LTD  13019748     PROC

[146 rows x 3 columns]

Total orders by Location¶

In [10]:
# answers to common business questions
print("Most orders from Asia-Pacific (APAC)")
total_APAC = df_merged_accountsorder_filter.loc[df_merged_accountsorder_filter['GEO_NAME'] == 'APAC', 'DISPLAY_NAME'].value_counts()
print(total_APAC, '\n')
     
print("Most orders from Peoples Republic of China (PROC)")
total_PROC = df_merged_accountsorder_filter.loc[df_merged_accountsorder_filter['GEO_NAME'] == 'PROC', 'DISPLAY_NAME'].value_counts()
print(total_PROC, '\n')

print("Most orders from America Sales and Marketing Organization (ASMO)")
total_ASMO = df_merged_accountsorder_filter.loc[df_merged_accountsorder_filter['GEO_NAME'] == 'ASMO', 'DISPLAY_NAME'].value_counts()
print(total_ASMO, '\n')

      
print("Most orders from Europe, Middle East, and Africa (EMEA)")
total_EMEA = df_merged_accountsorder_filter.loc[df_merged_accountsorder_filter['GEO_NAME'] == 'EMEA', 'DISPLAY_NAME'].value_counts()
print(total_EMEA, '\n')

print("Most orders from Japan (IJKK)")
total_IJKK = df_merged_accountsorder_filter.loc[df_merged_accountsorder_filter['GEO_NAME'] == 'IJKK', 'DISPLAY_NAME'].value_counts()
print(total_IJKK, '\n')
Most orders from Asia-Pacific (APAC)
IT SUPPLY AND OA COMPANY LIMITED     9192
ARRABON COMPUTER                     1548
CISCO                                 884
kevin                                 818
AC DIGITAL LIFESTYLES INC             625
ACA DIGITAL CORPORATION                77
DELL COMPUTER                          25
ACULA                                  24
IBM                                    18
ABBOTT LABORATORIES                    16
GIGABYTE                               12
WISTRON                                12
AAEON                                   9
ACROSSER                                8
NETZONE                                 8
JABIL                                   8
ABC                                     8
PEGATRON                                6
LG                                      6
INVENTEC                                6
PORTWELL                                5
TOSHIBA                                 5
AIC                                     4
GETAC TECHNOLOGY CORPORATION            4
ADVANTECH EQUIPMENT                     4
BD COMPUTER                             4
Clientron                               3
WINCOMM                                 3
ADVANSUS CORP.                          3
ELITEGROUP                              3
CELESTICA ELECTRONICS (M) SDN BHD       2
CASWELL INC                             2
AB Amanah                               2
AA                                      2
3 40-969                                2
A & A ENTERPRISES                       2
DUTA COMPUTER, PT                       2
ANCA                                    2
ALLIANCENP                              2
!HELP-F1                                2
PEGATRON CORP                           2
SIEMENS                                 2
Innocomm                                2
Supermicro                              1
Name: DISPLAY_NAME, dtype: int64 

Most orders from Peoples Republic of China (PROC)
ALCATEL SHANGHAI BELL                                       14
JABIL                                                       12
ARROW ASIA PAC LIMITED                                      12
PEGATRON                                                     9
Microchip Technolog                                          9
ARTEST                                                       8
Cisco                                                        8
Flextronics                                                  8
Dragon Technology Distribution                               6
ARTESYN TECHNOLOGIES ASIA-PACIFIC LTD                        5
SIEMENS                                                      5
MATROX ELECTRONIC SYSTEMS                                    4
BEIJING SENFETECH CO.LTD                                     4
Jabil Circuit (wuxi) Co., Ltd                                4
ABC                                                          4
ADVANTECH TECHNOLOGY CHINA LTD                               4
DONG GUAN Q & S ELECTRONIC MANUFACTURING COMPANY LIMITED     3
Kontron                                                      3
LENOVO                                                       3
Lenovo                                                       3
Flextek                                                      2
Flextronics Technology (Shanghai) Co., Ltd.                  2
AAEON                                                        2
ASTEELFLASH                                                  2
BEI FANG HENG YE CO LTD                                      2
PEGATRON CORP                                                2
CELESTICA ELECTRONICS (M) SDN BHD                            2
TOSHIBA                                                      2
GIFA Technology Limited.                                     2
Name: DISPLAY_NAME, dtype: int64 

Most orders from America Sales and Marketing Organization (ASMO)
CISCO                   4420
A & A Computers Inc.    2985
A & A Computers         1851
FLEXTRONICS /CITRIX     1044
A & A COMPUTERS INC      303
                        ... 
PT_BOTOTest316             1
PT_BOTOTest317             1
PT_BOTOTest318             1
PT_BOTOTest319             1
PT_AOTOTest441             1
Name: DISPLAY_NAME, Length: 10986, dtype: int64 

Most orders from Europe, Middle East, and Africa (EMEA)
"The New DeskTop" IT Trading AG         2764
CISCO                                    884
"ASC Advanced Systems and Computers"     884
"05 GROUP" LLC                           660
05 GROUP LLC                             386
                                        ... 
JABBLA BVBA                                2
WEARABLE CONCEPTS LIMITED                  2
3 B NEW AGE SA                             2
3CITY ELECTRONICS SP Z O O                 2
AA                                         2
Name: DISPLAY_NAME, Length: 76, dtype: int64 

Most orders from Japan (IJKK)
HPC SYSTEMS        9
TOSHIBA            4
SYSKEY CO.,LTD.    2
Name: DISPLAY_NAME, dtype: int64 

Visualization of Most Orders by Region¶

In [11]:
import matplotlib.pyplot as plt

region_total_list = [total_APAC, total_PROC, total_ASMO, total_EMEA, total_IJKK]
region_total_list_name = ['Asia-Pacific', 'Peoples Republic of China', 'America Sales and Marketing Organization', 'Europe, Middle East, and Africa', 'Japan?']

# probably can enumerate this instead instead of incrementing with count variable. 
count = 0
for region in region_total_list:
    region_top = region.head(10)
    plt.title(region_total_list_name[count] + " Top 10 Quotes Placed")
    plt.xlabel("Customer")
    plt.xticks(rotation=85)
    plt.ylabel("Orders")
    plt.bar(region_top.index, region_top.values)
    plt.show()
    count += 1

Total orders by Region¶

In [12]:
geo_meaning = {"APAC": "Asia-Pacific", 
               "PROC": "Peoples Republic of China", 
               "ASMO": "America Sales and Marketing Organization", 
               "EMEA": "Europe, Middle East, and Africa", 
               "IJKK": "Japan?"} ## not sure if this is JAPAN

df_merged_accounts_total = df_merged_accountsorder_filter

# finding the total amount of orders by region by returning a new series with the GEO_Names summed up
df_merged_accounts_total['GEO_NAME'] = df_merged_accounts_total['GEO_NAME'].apply(lambda x: 'Asia-Pacific' if x == 'APAC' else 'Peoples Republic of China' if x == 'PROC' else 'America Sales and Marketing Organization' if x == 'ASMO' else 'Europe, Middle East, and Africa' if x == 'EMEA' else 'Japan?' if x == 'IJKK' else x)
df_merged_accounts_total = df_merged_accounts_total.groupby('GEO_NAME').size().sort_values(ascending=False)

print(df_merged_accounts_total.to_string(index=True))
GEO_NAME
America Sales and Marketing Organization    25503
Asia-Pacific                                13375
Europe, Middle East, and Africa              6592
Peoples Republic of China                     146
Japan?                                         15
In [13]:
# Visualization with matplotlib
pie_labels = ["America Sales and Marketing Organization","Asia-Pacific","Europe, Middle East, and Africa", "Peoples Republic of China", "Japan"  ]
plt.bar(df_merged_accounts_total.index, df_merged_accounts_total.values)
plt.xticks(rotation=70)
plt.xlabel('Geographical Region')
plt.rcParams["figure.figsize"] = (10, 8)
plt.title("Top quotes placed by Geographical Region")
plt.show()

Quote Approvals¶

In [14]:
# Here I handled duplicate rows by manually identifying the right one, and setting the .4 suffix for thes STATUS. In pandas, duplicate columns will have .1, .2, .3, etc...
df_quote_item_analysis = df_quote_item
df_quote_item_analysis = df_quote_item.filter(['OID','QUOTE_NUM', 'QUANTITY_REQUESTED', 'MPN', 'DISTI_BOOK_COST', 'WORKFLOW_STATUS', 'STATUS.4', 'DISTI_REBATE_AMT', 'ITEM_NUM', 'PART_OID', 'REBATE_AMT'])
In [15]:
# check all quotes that have been "approved" THen check if they have a distributor cost associated with it. 
# bq is column

status_cleaned = df_quote_item_analysis
status_cleaned['ACTUAL_STATUS'] = np.where((df_quote_item_analysis['WORKFLOW_STATUS'] == 'Approved') | (df_quote_item_analysis['STATUS.4'] =='Approved'), 'Approved', 'Other')
status_cleaned['ACTUAL_STATUS'] = status_cleaned['ACTUAL_STATUS'].replace('\xa0', '')
status_cleaned['WORKFLOW_STATUS'] = df_quote_item_analysis['WORKFLOW_STATUS']
status_cleaned['STATUS'] = df_quote_item_analysis['STATUS.4']
print(status_cleaned['ACTUAL_STATUS'].str.count("Approved").sum())

# handling of NA values with .notnull and .isnull
df_disti_yes_approved_count = status_cleaned[(status_cleaned['ACTUAL_STATUS'] == 'Approved') & (status_cleaned['DISTI_BOOK_COST'].notnull())].shape[0] 
df_disti_yes_unapproved_count = status_cleaned[(status_cleaned['ACTUAL_STATUS'] == 'Other') & (status_cleaned['DISTI_BOOK_COST'].notnull())].shape[0]

# save them into dataframes, and don't count them.  total amount of NO DBC Approved and NO DBC Unapproved respectively
df_disti_yes_approved = status_cleaned[(status_cleaned['ACTUAL_STATUS'] == 'Approved') & (status_cleaned['DISTI_BOOK_COST'].notnull())]
df_disti_yes_unapproved = status_cleaned[(status_cleaned['ACTUAL_STATUS'] == 'Other') & (status_cleaned['DISTI_BOOK_COST'].notnull())]


# No DBC, Approved Quotes
df_disti_no_approved_count = status_cleaned[(status_cleaned['DISTI_BOOK_COST'].isnull()) & ((status_cleaned['WORKFLOW_STATUS']=='Approved') | (status_cleaned['STATUS']) == 'Approved')].shape[0]

# No DBC, Unapproved QUotes
df_disti_no_unapproved_count = status_cleaned[((status_cleaned['WORKFLOW_STATUS']=='Open') | (status_cleaned['WORKFLOW_STATUS']=='Submitted') | (status_cleaned['WORKFLOW_STATUS']=='Quoted')  & ((status_cleaned['WORKFLOW_STATUS']!='Approved') | (status_cleaned['STATUS']!='Approved')) & (status_cleaned['DISTI_BOOK_COST'].isnull()))].shape[0]
total_dbc = status_cleaned['DISTI_BOOK_COST'].notnull().sum()

# save into dataframes, don't count them. total amount of NO DBC Approved and NO DBC Unapproved respectively
df_disti_no_approved = status_cleaned[(status_cleaned['DISTI_BOOK_COST'].isnull()) & ((status_cleaned['WORKFLOW_STATUS']=='Approved') | (status_cleaned['STATUS']) == 'Approved')].shape[0]
df_disti_no_unapproved = status_cleaned[((status_cleaned['WORKFLOW_STATUS']=='Open') | (status_cleaned['WORKFLOW_STATUS']=='Submitted') | (status_cleaned['WORKFLOW_STATUS']=='Quoted')  & ((status_cleaned['WORKFLOW_STATUS']!='Approved') | (status_cleaned['STATUS']!='Approved')) & (status_cleaned['DISTI_BOOK_COST'].isnull()))].shape[0]

print("Total DBC Quotes: ", total_dbc, '\n')


print("DBC QUOTE APPROVED: ", df_disti_yes_approved_count)
print("DBC QUOTE DENIED: ", df_disti_yes_unapproved_count, '\n')

print("NO DBC, QUOTE APPROVED: ",df_disti_no_approved_count) # wrong
print("NO DBC, QUOTE DENIED", df_disti_no_unapproved_count) # wrong


print("Total successful dbc quotes percent: ", df_disti_yes_approved_count / total_dbc * 100,"%")
47110
Total DBC Quotes:  123242 

DBC QUOTE APPROVED:  47110
DBC QUOTE DENIED:  76132 

NO DBC, QUOTE APPROVED:  0
NO DBC, QUOTE DENIED 247002
Total successful dbc quotes percent:  38.225604907417924 %

Correlation Matrices¶

In [ ]:
 
In [16]:
### df_quote_item
# Nominal encoding and visualization
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler

#print(df_quote_item_analysis)

## here we label encode the string values, so correlation matrix can handle better. 
le = LabelEncoder()
df_dbc_quote_success_matrix = df_quote_item_analysis 
df_dbc_quote_success_matrix['STATUS_ENCODED'] = le.fit_transform(df_dbc_quote_success_matrix['ACTUAL_STATUS'])
#df_dbc_quote_success_matrix['MPN'] = le.fit(transform())

#print(df_dbc_quote_success_matrix)
#df_dbc_quote_success_matrix.dropna(how='any', axis=1, inplace=True)

#df_dbc_quote_success_matrix =df_dbc_quote_success_matrix.select_dtypes(include=[int, float])

df_dbc_quote_success_matrix2 = df_dbc_quote_success_matrix
df_quote_item_encoded = df_dbc_quote_success_matrix2
print(df_quote_item_encoded['ACTUAL_STATUS'].str.count("Approved").sum())

dbc_quote_success_matrix = df_dbc_quote_success_matrix.corr()
#dbc_quote_success_matrix = df_dbc_quote_success_matrix[['STATUS_ENCODED', 'DISTI_REBATE_AMT', 'DISTI_BOOK_COST', 'ITEM_NUM', 'PART_OID', 'REBATE_AMT']]

sns.heatmap(dbc_quote_success_matrix, annot=True, cmap='coolwarm', square=True)
plt.title('Heatmap of Correlation between Features relating to Quote Item Attributes')
plt.xlabel('Features')
plt.ylabel('Features')
plt.xticks(range(len(dbc_quote_success_matrix.columns)), dbc_quote_success_matrix.columns)
plt.yticks(range(len(dbc_quote_success_matrix.columns)), dbc_quote_success_matrix.columns)
plt.rcParams["figure.figsize"] = (6, 6)

## here we can see that this correlation matrix is broken apart into sections, which means this datatsets attributes are lowly correlated with each other. 
47110
In [17]:
# most related values sorting
quote_item_most_related = dbc_quote_success_matrix.stack().sort_values(ascending=False)
quote_item_most_related.columns = ['Feature_1', 'Feature_2', 'Relation_Score']
print(quote_item_most_related.columns)
# here we define the threshold, to print out the most relevant features. Extremely High and Extremely Low features do not mean better features, just more correlation. 
print(quote_item_most_related[((quote_item_most_related < 0.99) & (quote_item_most_related > 0.20) ) | ((quote_item_most_related <= -0.25) & (quote_item_most_related > -0.99))].sort_values(ascending=True))
['Feature_1', 'Feature_2', 'Relation_Score']
STATUS_ENCODED      PART_OID             -0.814218
PART_OID            STATUS_ENCODED       -0.814218
OID                 ITEM_NUM             -0.468479
ITEM_NUM            OID                  -0.468479
OID                 QUANTITY_REQUESTED   -0.444832
QUANTITY_REQUESTED  OID                  -0.444832
REBATE_AMT          ITEM_NUM             -0.263772
ITEM_NUM            REBATE_AMT           -0.263772
STATUS_ENCODED      DISTI_REBATE_AMT      0.228815
DISTI_REBATE_AMT    STATUS_ENCODED        0.228815
OID                 DISTI_REBATE_AMT      0.255496
DISTI_REBATE_AMT    OID                   0.255496
ITEM_NUM            QUANTITY_REQUESTED    0.313168
QUANTITY_REQUESTED  ITEM_NUM              0.313168
REBATE_AMT          OID                   0.472417
OID                 REBATE_AMT            0.472417
dtype: float64
In [18]:
# df_order_data
le_order_data = LabelEncoder()
df_order_data_analysis = df_order_data
order_data_matrix = df_order_data_analysis
## here we label encode the string values, so correlation matrix can handle better. 
order_data_matrix['MPN'] = le_order_data.fit_transform(df_order_data_analysis['MPN'])
order_data_matrix['QUOTE_NUM'] = le_order_data.fit_transform(df_order_data_analysis['QUOTE_NUM'])


# take specific attributes from the order_data dataset to put into a correlation matrix
order_data_matrix = order_data_matrix[['QUOTE_NUM', 'MPN', 'ACCUMULATED_USAGE', 'QUANTITY', 'QUANTITY.1', 'REQUOTE_PRICE', 'ADJ_DISTI_COST', 'ADJ_DISTI_RESALE']].corr()



sns.heatmap(order_data_matrix, annot=True, cmap='coolwarm', square=True)
plt.title('Heatmap of Correlation between Features relating to Order Data')
plt.xlabel('Features')
plt.ylabel('Features')
plt.xticks(range(len(order_data_matrix.columns)), order_data_matrix.columns)
plt.yticks(range(len(order_data_matrix.columns)), order_data_matrix.columns)
plt.rcParams["figure.figsize"] = (12, 8)
plt.show()

# most related values
order_data_most_related = order_data_matrix.stack().sort_values(ascending=False)

# here we define the threshold, to print out the most relevant features. Extremely High and Extremely Low features do not mean better features, just more correlation. 

order_data_most_related = order_data_most_related[((order_data_most_related < 0.99) & (order_data_most_related > 0.4) ) | ((order_data_most_related <= -0.4) & (order_data_most_related > -0.99))].sort_values(ascending=True)
order_data_most_related.columns = ['Feature_1', 'Feature_2', 'Relation_Score']
print(order_data_most_related.columns[0], "                ", order_data_most_related.columns[1], "             ", order_data_most_related.columns[2])
print(order_data_most_related)
Feature_1                  Feature_2               Relation_Score
ACCUMULATED_USAGE  QUOTE_NUM           -0.866738
QUOTE_NUM          ACCUMULATED_USAGE   -0.866738
QUANTITY.1         ADJ_DISTI_COST      -0.781447
ADJ_DISTI_RESALE   QUANTITY.1          -0.781447
QUANTITY.1         ADJ_DISTI_RESALE    -0.781447
ADJ_DISTI_COST     QUANTITY.1          -0.781447
QUANTITY           ACCUMULATED_USAGE   -0.652003
ACCUMULATED_USAGE  QUANTITY            -0.652003
QUANTITY.1         QUOTE_NUM           -0.436466
QUOTE_NUM          QUANTITY.1          -0.436466
QUANTITY           QUOTE_NUM            0.559781
QUOTE_NUM          QUANTITY             0.559781
dtype: float64
In [19]:
### df_part_attributes
le_part_attributes = LabelEncoder()
df_part_attributes_analysis = df_part_attributes
part_attributes_matrix = df_part_attributes_analysis

## here we label encode the string values, so correlation matrix can handle better. 
part_attributes_matrix['FAMILY'] = le_part_attributes.fit_transform(df_part_attributes_analysis['FAMILY'])
part_attributes_matrix['NAME'] = le_part_attributes.fit_transform(df_part_attributes_analysis['NAME'])
part_attributes_matrix['SELECTOR_VALUE'] = le_part_attributes.fit_transform(df_part_attributes_analysis['SELECTOR_VALUE'])
part_attributes_matrix['ATTRIBUTE_VALUE'] = le_part_attributes.fit_transform(df_part_attributes_analysis['ATTRIBUTE_VALUE'])
part_attributes_matrix['DESCRIPTION'] = le_part_attributes.fit_transform(df_part_attributes_analysis['DESCRIPTION'])


part_attributes_matrix = part_attributes_matrix[['PART_OID', 'MPN', 'PART_ATTRIBUTE_HEADER_OID', 'PRODUCT_FAMILY_OID', 'NAME', 'SELECTOR_VALUE', 'ATTRIBUTE_VALUE', 'DESCRIPTION']].corr()



sns.heatmap(part_attributes_matrix, annot=True, cmap='coolwarm', square=True)
plt.title('Heatmap of Correlation between Features relating to Part Attributes')
plt.xlabel('Features')
plt.ylabel('Features')
plt.xticks(range(len(part_attributes_matrix.columns)), part_attributes_matrix.columns)
plt.yticks(range(len(part_attributes_matrix.columns)), part_attributes_matrix.columns)
plt.rcParams["figure.figsize"] = (10, 8)
plt.show()

# most related values
part_attributes_most_related = part_attributes_matrix.stack().sort_values(ascending=False)

# here we define the threshold, to print out the most relevant features. Extremely High and Extremely Low features do not mean better features, just more correlation. 

part_attributes_most_related = part_attributes_most_related[((part_attributes_most_related < 0.99) & (part_attributes_most_related > 0.3) ) | ((part_attributes_most_related <= -0.3) & (part_attributes_most_related > -0.99))].sort_values(ascending=True)
part_attributes_most_related.columns = ['Feature_1', 'Feature_2', 'Relation_Score']
print(part_attributes_most_related.columns[0], "                ", part_attributes_most_related.columns[1], "             ", part_attributes_most_related.columns[2])
print(part_attributes_most_related)
Feature_1                  Feature_2               Relation_Score
PART_OID                   PRODUCT_FAMILY_OID          -0.311405
PRODUCT_FAMILY_OID         PART_OID                    -0.311405
PART_ATTRIBUTE_HEADER_OID  PRODUCT_FAMILY_OID           0.347231
PRODUCT_FAMILY_OID         PART_ATTRIBUTE_HEADER_OID    0.347231
DESCRIPTION                NAME                         0.939300
NAME                       DESCRIPTION                  0.939300
dtype: float64
In [20]:
### df_pos_data
le_pos_data = LabelEncoder()
df_pos_data_analysis = df_pos_data
pos_data_matrix = df_pos_data_analysis

## here we label encode the string values, so correlation matrix can handle better. 
pos_data_matrix['DISTRIBUTOR'] = le_part_attributes.fit_transform(df_pos_data_analysis['DISTRIBUTOR'])
pos_data_matrix['CUSTOMER'] = le_part_attributes.fit_transform(df_pos_data_analysis['CUSTOMER'])
pos_data_matrix['SHIP_DATE'] = le_part_attributes.fit_transform(df_pos_data_analysis['SHIP_DATE'])

df_pos_data_encoded = pos_data_matrix

pos_data_matrix = pos_data_matrix[['POS#', 'DISTRIBUTOR', 'CUSTOMER', 'SHIP_QTY', 'SHIP_DATE', 'DEBIT_NUM', 'QUANTITY_REQ', 'QUANTITY_REQ', 'QUANTITY_SHP', 'ACCEPTED_DISTI_COST', 'ACCEPTED_DISTI_RESALE', 'EFFECTIVE_DBC', 'OID', 'ACCUMULATED_USAGE']].corr()
pos_data_matrix.dropna(axis=1, how='all', inplace=True) # drop the rows that have ALL NA values. 


sns.heatmap(pos_data_matrix, annot=True, cmap='coolwarm', square=True)
plt.title('Heatmap of Correlation between Features relating to Point of Sale Attributes')
plt.xlabel('Features')
plt.ylabel('Features')
plt.xticks(range(len(pos_data_matrix.columns)), pos_data_matrix.columns)
plt.yticks(range(len(pos_data_matrix.columns)), pos_data_matrix.columns)
plt.rcParams["figure.figsize"] = (10, 8)
plt.show()

# most related values
pos_data_most_related = pos_data_matrix.stack().sort_values(ascending=False)

# here we define the threshold, to print out the most relevant features. Extremely High and Extremely Low features do not mean better features, just more correlation. 

pos_data_most_related = pos_data_most_related[((pos_data_most_related < 0.99) & (pos_data_most_related > 0.3) ) | ((pos_data_most_related <= -0.3) & (pos_data_most_related > -0.99))].sort_values(ascending=True)
pos_data_most_related.columns = ['Feature_1', 'Feature_2', 'Relation_Score']
print(pos_data_most_related.columns[0], "                ", part_attributes_most_related.columns[1], "             ", part_attributes_most_related.columns[2])
print(part_attributes_most_related)
Feature_1                  Feature_2               Relation_Score
PART_OID                   PRODUCT_FAMILY_OID          -0.311405
PRODUCT_FAMILY_OID         PART_OID                    -0.311405
PART_ATTRIBUTE_HEADER_OID  PRODUCT_FAMILY_OID           0.347231
PRODUCT_FAMILY_OID         PART_ATTRIBUTE_HEADER_OID    0.347231
DESCRIPTION                NAME                         0.939300
NAME                       DESCRIPTION                  0.939300
dtype: float64
In [21]:
### df_quotes
le_quotes = LabelEncoder()
df_quotes_data_analysis = df_quotes
quotes_data_matrix = df_quotes_data_analysis

## here we label encode the string values, so correlation matrix can handle better. 
quotes_data_matrix['COMPANY_SIZE'] = le_quotes.fit_transform(df_quotes_data_analysis['CUSTOM_FIELD2'])
quotes_data_matrix['COMPANY_TYPE'] = le_quotes.fit_transform(df_quotes_data_analysis['CUSTOM_FIELD3'])
quotes_data_matrix['NAME'] = le_quotes.fit_transform(df_quotes_data_analysis['NAME'])
quotes_data_matrix['DISTRIBUTOR_OID'] = le_quotes.fit_transform(df_quotes_data_analysis['DISTRIBUTOR_OID'])
quotes_data_matrix['ENTERPRISE_CHECK'] = le_quotes.fit_transform(df_quotes_data_analysis['CUSTOM_FIELD9'])
quotes_data_matrix['VOIDED_DATE'] = le_quotes.fit_transform(df_quotes_data_analysis['VOIDED_DATE'])


df_quotes_encoded = quotes_data_matrix

quotes_data_matrix = quotes_data_matrix[['COMPANY_SIZE', 'COMPANY_TYPE', 'NAME', 'DISTRIBUTOR_OID', 'ENTERPRISE_CHECK', 'CUSTOMER_OID', 'SALES_CHANNEL', 'DISTRIBUTOR_OID', 'QUOTE_STATUS_OID', 'PREP_COMPLETE', 'IS_FORWARD', 'IS_CONTRACT', 'QUOTE_NUM', 'VOIDED_DATE','PRICING_GROUP_TYPE' ]].corr()
quotes_data_matrix.dropna(axis=1, how='all', inplace=True)


sns.heatmap(quotes_data_matrix, annot=True, cmap='coolwarm', square=True)
plt.title('Heatmap of Correlation between Features relating to Quotes Attributes')
plt.xlabel('Features')
plt.ylabel('Features')
plt.xticks(range(len(quotes_data_matrix.columns)), quotes_data_matrix.columns)
plt.yticks(range(len(quotes_data_matrix.columns)), quotes_data_matrix.columns)
plt.rcParams["figure.figsize"] = (10, 10)
plt.show()

# most related values
quotes_data_most_related = quotes_data_matrix.stack().sort_values(ascending=False)

# here we define the threshold, to print out the most relevant features. Extremely High and Extremely Low features do not mean better features, just more correlation. 
quotes_data_most_related = quotes_data_most_related[((quotes_data_most_related < 0.99) & (quotes_data_most_related > 0.5) ) | ((quotes_data_most_related <= -0.5) & (quotes_data_most_related > -0.99))].sort_values(ascending=True)
quotes_data_most_related.columns = ['Feature_1', 'Feature_2', 'Relation_Score']
print(quotes_data_most_related.columns[0], "                ", quotes_data_most_related.columns[1], "             ", quotes_data_most_related.columns[2])
print(quotes_data_most_related)
Feature_1                  Feature_2               Relation_Score
NAME              SALES_CHANNEL      -0.965678
SALES_CHANNEL     NAME               -0.965678
DISTRIBUTOR_OID   IS_CONTRACT        -0.941104
IS_CONTRACT       DISTRIBUTOR_OID    -0.941104
DISTRIBUTOR_OID   IS_CONTRACT        -0.941104
IS_CONTRACT       DISTRIBUTOR_OID    -0.941104
DISTRIBUTOR_OID   NAME               -0.926175
NAME              DISTRIBUTOR_OID    -0.926175
DISTRIBUTOR_OID   NAME               -0.926175
NAME              DISTRIBUTOR_OID    -0.926175
QUOTE_STATUS_OID  SALES_CHANNEL      -0.833603
SALES_CHANNEL     QUOTE_STATUS_OID   -0.833603
QUOTE_STATUS_OID  DISTRIBUTOR_OID    -0.817854
                  DISTRIBUTOR_OID    -0.817854
DISTRIBUTOR_OID   QUOTE_STATUS_OID   -0.817854
                  QUOTE_STATUS_OID   -0.817854
QUOTE_STATUS_OID  ENTERPRISE_CHECK   -0.814403
ENTERPRISE_CHECK  QUOTE_STATUS_OID   -0.814403
                  NAME               -0.700408
NAME              ENTERPRISE_CHECK   -0.700408
DISTRIBUTOR_OID   CUSTOMER_OID       -0.656421
CUSTOMER_OID      DISTRIBUTOR_OID    -0.656421
DISTRIBUTOR_OID   CUSTOMER_OID       -0.656421
CUSTOMER_OID      DISTRIBUTOR_OID    -0.656421
ENTERPRISE_CHECK  IS_CONTRACT        -0.654749
IS_CONTRACT       ENTERPRISE_CHECK   -0.654749
CUSTOMER_OID      SALES_CHANNEL      -0.651665
SALES_CHANNEL     CUSTOMER_OID       -0.651665
CUSTOMER_OID      ENTERPRISE_CHECK   -0.510266
ENTERPRISE_CHECK  CUSTOMER_OID       -0.510266
CUSTOMER_OID      NAME                0.605170
NAME              CUSTOMER_OID        0.605170
ENTERPRISE_CHECK  SALES_CHANNEL       0.649434
SALES_CHANNEL     ENTERPRISE_CHECK    0.649434
DISTRIBUTOR_OID   ENTERPRISE_CHECK    0.654878
                  ENTERPRISE_CHECK    0.654878
ENTERPRISE_CHECK  DISTRIBUTOR_OID     0.654878
                  DISTRIBUTOR_OID     0.654878
IS_CONTRACT       CUSTOMER_OID        0.656766
CUSTOMER_OID      IS_CONTRACT         0.656766
                  QUOTE_STATUS_OID    0.721604
QUOTE_STATUS_OID  CUSTOMER_OID        0.721604
                  NAME                0.837872
NAME              QUOTE_STATUS_OID    0.837872
IS_CONTRACT       QUOTE_STATUS_OID    0.838532
QUOTE_STATUS_OID  IS_CONTRACT         0.838532
COMPANY_SIZE      COMPANY_TYPE        0.883002
COMPANY_TYPE      COMPANY_SIZE        0.883002
DISTRIBUTOR_OID   SALES_CHANNEL       0.946361
SALES_CHANNEL     DISTRIBUTOR_OID     0.946361
DISTRIBUTOR_OID   SALES_CHANNEL       0.946361
SALES_CHANNEL     DISTRIBUTOR_OID     0.946361
IS_CONTRACT       NAME                0.960313
NAME              IS_CONTRACT         0.960313
dtype: float64

Detecting Outliers with Z-Score¶

In [22]:
## calculate total bins
total_values_quote_item = dbc_quote_success_matrix.values.flatten() 
print("Total values: ", len(total_values_quote_item))
quote_item_bins = int(np.sqrt(len(total_values_quote_item)))
print("Bins: ", quote_item_bins)

# total bins tells use total observations that can be considered outliers. 
Total values:  100
Bins:  10
In [23]:
from scipy import stats
import seaborn as sns

## let's take the outlier z-score for the quote_item dataset

zscores_quote_item = np.abs(stats.zscore(df_quote_item['PART_OID']))

# take outliers as part_oid quote_items with a value greater than 3. 
threshold_quote_item = 3
outliers_quote_item = df_quote_item[zscores_quote_item > threshold_quote_item]

print(outliers_quote_item.head(10))


'''
zscores_quote_item = dbc_quote_success_matrix.apply(stats.zscore)
#zscores_quote_item.dropna(inplace=True)

outliers_quote_item = np.abs(zscores_quote_item) > 2

outlier_values_quote_item = dbc_quote_success_matrix[outliers_quote_item]

sns.boxplot(data=outlier_values_quote_item)

plt.title("Box Plot Quote Item Outliers")

plt.show()



zscore_dbc_quote_success_matrix = zscore_dbc_quote_success_matrix.apply(lambda x: (x - x.mean()) / x.std())


outliers_quote_item = zscore_dbc_quote_success_matrix[(zscore_dbc_quote_success_matrix > 2) | (zscore_dbc_quote_success_matrix < -2)]

#outliers rows
outliers_quote_item_rows = zscore_dbc_quote_success_matrix[outliers_quote_item.any(axis=1)]
outlier_neat = pd.concat([outliers_quote_item_rows, outliers_quote_item[outliers_quote_item.any(axis=1)]], axis=1)


print("Outliers: ")
print(outlier_neat.head(10))

plt.hist(outlier_neat['DISTI_REBATE_AMT'])
plt.xlabel('DISTI REBATE AMT Z-SCORE')
plt.ylabel('Frequency')
plt.title('HISTOGRAM OF DISTI REBATE Z-scores')

'''
         OID          CREATED_DATE         MODIFIED_DATE  \
1716  297552  22-DEC-2015 21:48:24  23-MAR-2017 00:00:00   
1717  297552  22-DEC-2015 21:48:24  23-MAR-2017 00:00:00   
1718  297552  22-DEC-2015 21:48:24  23-MAR-2017 00:00:00   
1719  297552  22-DEC-2015 21:48:24  23-MAR-2017 00:00:00   
1720  297552  22-DEC-2015 21:48:24  23-MAR-2017 00:00:00   
1721  297552  22-DEC-2015 21:48:24  23-MAR-2017 00:00:00   
1722  297552  22-DEC-2015 21:48:24  23-MAR-2017 00:00:00   
1723  297552  22-DEC-2015 21:48:24  23-MAR-2017 00:00:00   
1724  297552  22-DEC-2015 21:48:24  23-MAR-2017 00:00:00   
1725  297552  22-DEC-2015 21:48:24  23-MAR-2017 00:00:00   

         GUI_MODIFIED_DATE  OBSOLETE_FLAG  QUOTE_OID QUOTE_NUM  ITEM_NUM  \
1716  22-DEC-2015 21:51:27              0      11721    Q10358         1   
1717  22-DEC-2015 21:51:27              0      11721    Q10358         1   
1718  22-DEC-2015 21:51:27              0      11721    Q10358         1   
1719  22-DEC-2015 21:51:27              0      11721    Q10358         1   
1720  22-DEC-2015 21:51:27              0      11721    Q10358         1   
1721  22-DEC-2015 21:51:27              0      11721    Q10358         1   
1722  22-DEC-2015 21:51:27              0      11721    Q10358         1   
1723  22-DEC-2015 21:51:27              0      11721    Q10358         1   
1724  22-DEC-2015 21:51:27              0      11721    Q10358         1   
1725  22-DEC-2015 21:51:27              0      11721    Q10358         1   

      PART_OID     MPN  ...  CUSTOM_FIELD1  CUSTOM_FIELD2 CUSTOM_FIELD3  \
1716    194278  936893  ...            NaN            NaN           NaN   
1717    194278  936893  ...            NaN            NaN           NaN   
1718    194278  936893  ...            NaN            NaN           NaN   
1719    194278  936893  ...            NaN            NaN           NaN   
1720    194278  936893  ...            NaN            NaN           NaN   
1721    194278  936893  ...            NaN            NaN           NaN   
1722    194278  936893  ...            NaN            NaN           NaN   
1723    194278  936893  ...            NaN            NaN           NaN   
1724    194278  936893  ...            NaN            NaN           NaN   
1725    194278  936893  ...            NaN            NaN           NaN   

      CUSTOM_FIELD4  CUSTOM_FIELD5  CUSTOM_FIELD6  CUSTOM_FIELD7  \
1716            Yes            NaN            NaN            NaN   
1717            Yes            NaN            NaN            NaN   
1718            Yes            NaN            NaN            NaN   
1719            Yes            NaN            NaN            NaN   
1720            Yes            NaN            NaN            NaN   
1721            Yes            NaN            NaN            NaN   
1722            Yes            NaN            NaN            NaN   
1723            Yes            NaN            NaN            NaN   
1724            Yes            NaN            NaN            NaN   
1725            Yes            NaN            NaN            NaN   

      CUSTOM_FIELD8  CUSTOM_FIELD9  CUSTOM_FIELD10  
1716            NaN            NaN             NaN  
1717            NaN            NaN             NaN  
1718            NaN            NaN             NaN  
1719            NaN            NaN             NaN  
1720            NaN            NaN             NaN  
1721            NaN            NaN             NaN  
1722            NaN            NaN             NaN  
1723            NaN            NaN             NaN  
1724            NaN            NaN             NaN  
1725            NaN            NaN             NaN  

[10 rows x 113 columns]
Out[23]:
'\nzscores_quote_item = dbc_quote_success_matrix.apply(stats.zscore)\n#zscores_quote_item.dropna(inplace=True)\n\noutliers_quote_item = np.abs(zscores_quote_item) > 2\n\noutlier_values_quote_item = dbc_quote_success_matrix[outliers_quote_item]\n\nsns.boxplot(data=outlier_values_quote_item)\n\nplt.title("Box Plot Quote Item Outliers")\n\nplt.show()\n\n\n\nzscore_dbc_quote_success_matrix = zscore_dbc_quote_success_matrix.apply(lambda x: (x - x.mean()) / x.std())\n\n\noutliers_quote_item = zscore_dbc_quote_success_matrix[(zscore_dbc_quote_success_matrix > 2) | (zscore_dbc_quote_success_matrix < -2)]\n\n#outliers rows\noutliers_quote_item_rows = zscore_dbc_quote_success_matrix[outliers_quote_item.any(axis=1)]\noutlier_neat = pd.concat([outliers_quote_item_rows, outliers_quote_item[outliers_quote_item.any(axis=1)]], axis=1)\n\n\nprint("Outliers: ")\nprint(outlier_neat.head(10))\n\nplt.hist(outlier_neat[\'DISTI_REBATE_AMT\'])\nplt.xlabel(\'DISTI REBATE AMT Z-SCORE\')\nplt.ylabel(\'Frequency\')\nplt.title(\'HISTOGRAM OF DISTI REBATE Z-scores\')\n\n'
In [24]:
## IQR
print("test")

df_quote_item_all_encoded = df_quote_item
df_quote_item_all_encoded['STATUS_ENCODED'] = df_dbc_quote_success_matrix['STATUS_ENCODED']
df_quote_item_all_encoded.dropna(axis=1, how='all')

le_quote_item_encoded = LabelEncoder()

## encode all columns that have strings with LabelEncoder
for col in df_quote_item_all_encoded.columns:
    if (df_quote_item_all_encoded[col].dtype == 'object'):
        df_quote_item_all_encoded[col] = le_quote_item_encoded.fit_transform(df_quote_item_all_encoded[col])
    
## here we identify values that are less than 1.5 times than interquartile range or 1.5 above the interquartile range as an outlier. 
for attribute in df_quote_item_all_encoded.columns:
    Q1 = df_quote_item[attribute].quantile(0.25)
    Q3 = df_quote_item[attribute].quantile(0.75)
    IQR = Q3 - Q1

    threshold  = 1.5
    outliers = df_quote_item[(df_quote_item[attribute] < Q1 - threshold * IQR) | (df_quote_item[attribute] > Q3 + threshold * IQR)]
    
    print(f"Outliers in {attribute}")
    print('Mean: ', df_quote_item[attribute].mean())
    print('Median: ', df_quote_item[attribute].median())
    print('Standard Deviation: ', df_quote_item[attribute].std())
    print('Number of Outliers: ', len(outliers))
    print('\n')
    
test
Outliers in OID
Mean:  2098208.8348294436
Median:  1733352.0
Standard Deviation:  748630.7643580899
Number of Outliers:  93226


Outliers in CREATED_DATE
Mean:  151.7884500299222
Median:  168.0
Standard Deviation:  60.95435673463235
Number of Outliers:  25340


Outliers in MODIFIED_DATE
Mean:  104.52128216636744
Median:  106.0
Standard Deviation:  26.98658148268543
Number of Outliers:  71260


Outliers in GUI_MODIFIED_DATE
Mean:  192.84406792339917
Median:  210.0
Standard Deviation:  78.32329278042761
Number of Outliers:  99554


Outliers in OBSOLETE_FLAG
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in QUOTE_OID
Mean:  25530.28194195093
Median:  19883.0
Standard Deviation:  10529.09006012626
Number of Outliers:  93296


Outliers in QUOTE_NUM
Mean:  47.673099940155595
Median:  10.0
Standard Deviation:  61.594533771805914
Number of Outliers:  23870


Outliers in ITEM_NUM
Mean:  163.54039497307002
Median:  116.0
Standard Deviation:  164.75308915470703
Number of Outliers:  0


Outliers in PART_OID
Mean:  19078.02247905446
Median:  699.0
Standard Deviation:  53489.86733841716
Number of Outliers:  41776


Outliers in MPN
Mean:  307.6324431478157
Median:  333.0
Standard Deviation:  142.18222558015447
Number of Outliers:  0


Outliers in QUANTITY_REQUESTED
Mean:  50210.992033213646
Median:  10000.0
Standard Deviation:  55128.078620663946
Number of Outliers:  0


Outliers in QUANTITY
Mean:  50208.74880311191
Median:  10000.0
Standard Deviation:  55129.94869044122
Number of Outliers:  0


Outliers in START_DATE
Mean:  34.84328246558947
Median:  39.0
Standard Deviation:  12.483934148269004
Number of Outliers:  106554


Outliers in OEM_PRICE
Mean:  5487.96948261924
Median:  122.0
Standard Deviation:  17030.884260240404
Number of Outliers:  21420


Outliers in DISTI_COST
Mean:  3245.6367042987413
Median:  222.0
Standard Deviation:  12733.781826938079
Number of Outliers:  31640


Outliers in DISTI_RESALE
Mean:  561.8630344490593
Median:  210.0
Standard Deviation:  1151.5917317036085
Number of Outliers:  11690


Outliers in ADDERS_COST
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in TARGET_PRICE
Mean:  106.67836631333759
Median:  10.0
Standard Deviation:  434.59750845478794
Number of Outliers:  8666


Outliers in COMPETITOR_OID
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in COMP_PART_OID
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in QUOTE_JUSTIFICATION
Mean:  1.460951526032316
Median:  1.0
Standard Deviation:  1.0364632477562803
Number of Outliers:  77210


Outliers in REQUOTE_PRICE
Mean:  3739.184372275501
Median:  199.0
Standard Deviation:  13913.556601714774
Number of Outliers:  32340


Outliers in ADJ_DISTI_COST
Mean:  2572.652808609908
Median:  300.0
Standard Deviation:  10901.901732005757
Number of Outliers:  19670


Outliers in ADJ_DISTI_RESALE
Mean:  530.5064991123234
Median:  210.0
Standard Deviation:  907.3695677601581
Number of Outliers:  11410


Outliers in ORDER_ITEM_OID
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in DESIGN_PART_MAPPING_OID
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in STATUS
Mean:  nan
/home/arao/.local/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
/home/arao/.local/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
/home/arao/.local/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
Median:  nan
Standard Deviation:  nan
Number of Outliers:  0


Outliers in REF_PRICE
Mean:  1.3839753466872111
Median:  1.0
Standard Deviation:  3.8597036992698177
Number of Outliers:  140


Outliers in MINIMUM_PRICE
Mean:  2.334742180896027
Median:  2.0
Standard Deviation:  8.134352987179424
Number of Outliers:  140


Outliers in FIELD_MIN
Mean:  4267.437537552573
Median:  2.0
Standard Deviation:  15559.252538574245
Number of Outliers:  27930


Outliers in DISTI_BOOK_COST
Mean:  425.58013177325915
Median:  2.0
Standard Deviation:  5413.83933158489
Number of Outliers:  11830


Outliers in MANUFACTURER_COST
Mean:  9.63961038961039
Median:  2.0
Standard Deviation:  36.76738146933893
Number of Outliers:  3570


Outliers in MPP_OID
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in STATUS.1
Mean:  nan
Median:  nan
Standard Deviation:  nan
Number of Outliers:  0


Outliers in MPP_ITEM_OID
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in STATUS.2
Mean:  nan
Median:  nan
Standard Deviation:  nan
Number of Outliers:  0


Outliers in EXPIRATION_DATE
Mean:  3.051877618192699
Median:  0.0
Standard Deviation:  7.230239325466396
Number of Outliers:  87150


Outliers in RECOMMENDED_PRICE
Mean:  5496.694939271255
Median:  122.0
Standard Deviation:  17043.304010861575
Number of Outliers:  17850


Outliers in CONTRACT_ITEM_OID
Mean:  398565.81388390186
Median:  0.0
Standard Deviation:  726907.6932366411
Number of Outliers:  86520


Outliers in DEBIT_EXPIRATION_DATE
Mean:  12.925007480550569
Median:  13.0
Standard Deviation:  0.8357540370099139
Number of Outliers:  3500


Outliers in MPP_PRICE
Mean:  nan
Median:  nan
Standard Deviation:  nan
Number of Outliers:  0


Outliers in CURRENCY_INFO_OID
Mean:  2.0
Median:  2.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in CODE
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in EXCHANGE_RATE
Mean:  1.0
Median:  1.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in SEGMENT_OID
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in DBC_EXCHANGE_RATE
Mean:  0.32966786355475763
Median:  0.0
Standard Deviation:  0.4700931329959174
Number of Outliers:  0


Outliers in PRICING_DESIGN_REG_OID
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in REG_NUM
Mean:  nan
Median:  nan
Standard Deviation:  nan
Number of Outliers:  0


Outliers in BUSINESS_RULE_OID
Mean:  0.7697486535008977
Median:  0.0
Standard Deviation:  8.004157986091068
Number of Outliers:  3430


Outliers in STATUS.3
Mean:  nan
Median:  nan
Standard Deviation:  nan
Number of Outliers:  0


Outliers in PRICING_CONVERSION_OID
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in STATUS.4
Mean:  nan
Median:  nan
Standard Deviation:  nan
Number of Outliers:  0


Outliers in END_CUSTOMER_OID
Mean:  1905859.2780894674
Median:  1921147.0
Standard Deviation:  81355.74660317159
Number of Outliers:  4046


Outliers in DISPLAY_NAME
/home/arao/.local/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
/home/arao/.local/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
/home/arao/.local/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
/home/arao/.local/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
Mean:  2.848967684021544
Median:  0.0
Standard Deviation:  4.727571255887125
Number of Outliers:  92330


Outliers in PROGRAM_OID
Mean:  77.17010771992818
Median:  102.0
Standard Deviation:  39.63345981242074
Number of Outliers:  93366


Outliers in NAME
Mean:  7.318185218432077
Median:  6.0
Standard Deviation:  2.62132796624861
Number of Outliers:  35560


Outliers in ASSEMBLY_OID
Mean:  0.01335278276481149
Median:  0.0
Standard Deviation:  0.7128036679157977
Number of Outliers:  154


Outliers in NAME.1
Mean:  1.1999177139437462
Median:  1.0
Standard Deviation:  1.1662798096006568
Number of Outliers:  0


Outliers in AUTO_REQUOTE_PRICE
Mean:  3736.100587979094
Median:  199.0
Standard Deviation:  13907.887479489955
Number of Outliers:  32340


Outliers in AUTO_ADJ_DISTI_COST
Mean:  3252.0746483794883
Median:  222.0
Standard Deviation:  12764.84831458255
Number of Outliers:  31500


Outliers in AUTO_ADJ_DISTI_RESALE
Mean:  531.9677796944308
Median:  210.0
Standard Deviation:  912.2791611771639
Number of Outliers:  11480


Outliers in IS_NO_BID
Mean:  0.00037402752842609214
Median:  0.0
Standard Deviation:  0.019336200007353043
Number of Outliers:  140


Outliers in TYPE1_PRICING_CONVERSION_OID
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in OEM_PRICE_DEF_OID
Mean:  0.8909709754637941
Median:  1.0
Standard Deviation:  0.31167604314702607
Number of Outliers:  40810


Outliers in NAME.2
Mean:  0.10902902453620586
Median:  0.0
Standard Deviation:  0.31167604314702607
Number of Outliers:  40810


Outliers in DBC_PRICE_DEF_OID
Mean:  0.3977408737283064
Median:  0.0
Standard Deviation:  0.6265152662868081
Number of Outliers:  1722


Outliers in NAME.3
Mean:  0.10902902453620586
Median:  0.0
Standard Deviation:  0.31167604314702607
Number of Outliers:  40810


Outliers in IS_SPECIAL_BUY
Mean:  0.009762118491921006
Median:  0.0
Standard Deviation:  0.09832011676487369
Number of Outliers:  3654


Outliers in WORKFLOW_STATUS
Mean:  2.4708632555356074
Median:  2.0
Standard Deviation:  1.3616563913412159
Number of Outliers:  87780


Outliers in RECOMMENDED_COST
Mean:  2782.5987093768754
Median:  259.0
Standard Deviation:  11520.98160568556
Number of Outliers:  19600


Outliers in IS_BUDGETARY
Mean:  0.46891831238779175
Median:  0.0
Standard Deviation:  0.4990336602096844
Number of Outliers:  0


Outliers in LAST_APPROVED_DATE
Mean:  485.7184694793537
Median:  530.0
Standard Deviation:  237.91771080688864
Number of Outliers:  0


Outliers in RECOMMENDED_RESALE
Mean:  542.7938151212903
Median:  210.0
Standard Deviation:  917.3868666732606
Number of Outliers:  11340


Outliers in ACCUMULATED_USAGE
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in SUB_ITEM_NUM
Mean:  0.037066128067025736
Median:  0.0
Standard Deviation:  0.4233727370782561
Number of Outliers:  4662


Outliers in PART_TYPE
Mean:  0.05703919808497906
Median:  0.0
Standard Deviation:  0.5079423665941885
Number of Outliers:  4802


Outliers in DEBIT_START_DATE
Mean:  50.907166367444646
Median:  62.0
Standard Deviation:  18.41238513012249
Number of Outliers:  11410


Outliers in THRESHOLD_PRICE
Mean:  nan
Median:  nan
Standard Deviation:  nan
Number of Outliers:  0


Outliers in REBATE_AMT
Mean:  137.27683144512667
Median:  0.0
Standard Deviation:  583.8960628622559
Number of Outliers:  53200


Outliers in DISTI_REBATE_AMT
Mean:  0.8467101781619333
Median:  0.0
Standard Deviation:  1.6340000697377453
Number of Outliers:  16800


Outliers in POCKET_PRICE
Mean:  5487.96948261924
Median:  122.0
Standard Deviation:  17030.884260240404
Number of Outliers:  21420


Outliers in POCKET_COST_PRICE
Mean:  406.0308568092429
Median:  208.0
Standard Deviation:  1371.2370373060305
Number of Outliers:  14980


Outliers in RECOMMENDED_POCKET_PRICE
Mean:  5496.694939271255
Median:  122.0
/home/arao/.local/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
Standard Deviation:  17043.304010861575
Number of Outliers:  17850


Outliers in RECOMM_BUNDLE_POCKET_COST
Mean:  23.5
Median:  23.5
Standard Deviation:  0.5000163835050915
Number of Outliers:  0


Outliers in DISTI_POCKET_PRICE
Mean:  278.34413401109055
Median:  24.0
Standard Deviation:  1307.65458068554
Number of Outliers:  9842


Outliers in THRESHOLD_OVERRIDE
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in IS_UNLIMITED_QUANTITY
Mean:  0.00037402752842609214
Median:  0.0
Standard Deviation:  0.01933620000735306
Number of Outliers:  140


Outliers in DEAL_SCORE
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in IS_MATRIX
Mean:  0.2324581089168163
Median:  0.0
Standard Deviation:  0.42240006296382904
Number of Outliers:  87010


Outliers in QUANTITY_RECOMMENDED
Mean:  37570.33598892878
Median:  1000.0
Standard Deviation:  52473.49926594756
Number of Outliers:  0


Outliers in DEAL_AUTH_PRICE
Mean:  6390.259996907376
Median:  264.0
Standard Deviation:  18088.171528908733
Number of Outliers:  12670


Outliers in DEAL_AUTH_QUANTITY
Mean:  49742.11093206683
Median:  10000.0
Standard Deviation:  55118.55436494999
Number of Outliers:  0


Outliers in DEAL_AUTH_BUSINESS_RULE_OID
Mean:  55.96817025733094
Median:  82.0
Standard Deviation:  53.503454359794056
Number of Outliers:  0


Outliers in STATUS.5
Mean:  0.8825553560742071
Median:  1.0
Standard Deviation:  0.3219498042511717
Number of Outliers:  43960


Outliers in RECOMMENDED_REBATE_AMT
Mean:  nan
Median:  nan
Standard Deviation:  nan
Number of Outliers:  0


Outliers in HIST_PRICE
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in HIST_DISTI_BOOK_COST
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in HIST_DISTI_COST
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in HIST_DISTI_RESALE
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in HIST_MANUFACTURER_COST
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in HIST_CONTRACT_ITEM_OID
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in HIST_MPP_ITEM_OID
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in HIST_QUOTE_ITEM_OID
Mean:  0.0
Median:  0.0
Standard Deviation:  0.0
Number of Outliers:  0


Outliers in CUSTOM_FIELD1
Mean:  nan
Median:  nan
Standard Deviation:  nan
Number of Outliers:  0


Outliers in CUSTOM_FIELD2
Mean:  61728395.0
Median:  61728395.0
Standard Deviation:  61950040.65209605
Number of Outliers:  0


Outliers in CUSTOM_FIELD3
Mean:  15.871895571514063
Median:  17.0
Standard Deviation:  3.5183919101591976
Number of Outliers:  38304


Outliers in CUSTOM_FIELD4
Mean:  1.2326451226810293
Median:  1.0
Standard Deviation:  0.4229608433944125
Number of Outliers:  87220


Outliers in CUSTOM_FIELD5
Mean:  0.9996259724715739
Median:  1.0
Standard Deviation:  0.019336200007353053
Number of Outliers:  140


/home/arao/.local/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
/home/arao/.local/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
Outliers in CUSTOM_FIELD6
Mean:  22.824431478156793
Median:  24.0
Standard Deviation:  11.19443995713687
Number of Outliers:  0


Outliers in CUSTOM_FIELD7
Mean:  nan
Median:  nan
Standard Deviation:  nan
Number of Outliers:  0


Outliers in CUSTOM_FIELD8
Mean:  0.999812986235787
Median:  1.0
Standard Deviation:  0.01367403706312161
Number of Outliers:  70


Outliers in CUSTOM_FIELD9
Mean:  80.4375
Median:  28.0
Standard Deviation:  92.77556026622727
Number of Outliers:  0


Outliers in CUSTOM_FIELD10
Mean:  14.966113105924595
Median:  16.0
Standard Deviation:  3.392058924435953
Number of Outliers:  38234


Outliers in STATUS_ENCODED
Mean:  0.87413973668462
Median:  1.0
Standard Deviation:  0.3316922540022661
Number of Outliers:  47110


/home/arao/.local/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
In [25]:
for attribute in df_quote_item_all_encoded.columns:
    fig, ax = plt.subplots(figsize=(8,6))
    non_na_count = df_quote_item[attribute].count()
    num_bins = int(np.ceil(np.sqrt(non_na_count)))
    sns.histplot(data=df_quote_item, x=df_quote_item[attribute], bins=num_bins)
    
    # show distribution of each value. Use z-score bins as an input for the histogram plot. Using this, we can understand more about the distribution of each attribute. 
    
    individuaL_x_values = df_quote_item[attribute].unique()
    ax.tick_params(axis='x', labelrotation = 45)
    ax.set_title(f"{attribute} Distribution")
    ax.set_xlabel(attribute)
    ax.set_ylabel("Frequency")
    #plt.clf()
    # using distribution and frequency it's easy to identify the outliers. an added part here might be to store those outliers and filter them out accordingly. 
    plt.rcParams.update({'figure.max_open_warning': 0})  # allows you to display all the charts at once 
    
plt.show()
In [26]:
# this part was unused. 
# this type of heatmap can be used to show relationship of variables on a standardized scale (z_scores)
attribute_means = quotes_data_matrix.mean()
attribute_stds = quotes_data_matrix.std()

z_scores= quotes_data_matrix.dropna(inplace=True)
z_scores = quotes_data_matrix.apply(stats.zscore)



'''
##  outlier z-score for the quotes dataset

zscore_quotes_data_matrix = quotes_data_matrix
zscore_quotes_data_matrix.dropna(inplace=True)

zscore_quotes_data_matrix = zscore_quotes_data_matrix.apply(lambda x: (x - x.mean()) / x.std())


outliers_quotes = zscore_quotes_data_matrix[(zscore_quotes_data_matrix > 2) | (zscore_quotes_data_matrix < -2)]

#outliers rows
outliers_quotes_rows = zscore_quotes_data_matrix[outliers_quotes.any(axis=1)]
outlier_neat_quotes = pd.concat([outliers_quotes_rows, outliers_quotes[outliers_quotes.any(axis=1)]], axis=1)


print("Outliers: ")
print(outlier_neat_quotes.head(10))

plt.hist(outlier_neat_quotes['COMPANY_SIZE'])
plt.xlabel('COMPANY_SIZE')
plt.ylabel('Frequency')
plt.title('HISTOGRAM OF COMPANY SIZE Z-scores')

# there's only one outlier so can't put histogram

plt.show()
'''
Out[26]:
'\n##  outlier z-score for the quotes dataset\n\nzscore_quotes_data_matrix = quotes_data_matrix\nzscore_quotes_data_matrix.dropna(inplace=True)\n\nzscore_quotes_data_matrix = zscore_quotes_data_matrix.apply(lambda x: (x - x.mean()) / x.std())\n\n\noutliers_quotes = zscore_quotes_data_matrix[(zscore_quotes_data_matrix > 2) | (zscore_quotes_data_matrix < -2)]\n\n#outliers rows\noutliers_quotes_rows = zscore_quotes_data_matrix[outliers_quotes.any(axis=1)]\noutlier_neat_quotes = pd.concat([outliers_quotes_rows, outliers_quotes[outliers_quotes.any(axis=1)]], axis=1)\n\n\nprint("Outliers: ")\nprint(outlier_neat_quotes.head(10))\n\nplt.hist(outlier_neat_quotes[\'COMPANY_SIZE\'])\nplt.xlabel(\'COMPANY_SIZE\')\nplt.ylabel(\'Frequency\')\nplt.title(\'HISTOGRAM OF COMPANY SIZE Z-scores\')\n\n# there\'s only one outlier so can\'t put histogram\n\nplt.show()\n'

Box Plots¶

Heatmaps in comments¶
In [27]:
# this was an attempt to have the z-scores in a heatmap, create a pairplot, and also make box-plots. Based on the distribution and the format of the data however, the visualizations do not appear as useful as they would be with real customer data. 
sns.set()
sns.heatmap(z_scores, annot=True, cmap='coolwarm', square=True)
sns.pairplot(df_quotes_encoded, diag_kind="kde")
print(df_quotes_encoded)
fig, ax = plt.subplots(nrows=len(df_quotes_encoded.columns), ncols=1, figsize=(6, 2 * len(df_quotes_encoded.columns)))
for i, col in enumerate(df_quotes_encoded.columns):
    ax[i].boxplot(df_quotes_encoded[col])
    ax[i].set_title(col)
plt.tight_layout()
plt.style.use('seaborn-whitegrid')
print("true")
plt.show()
         OID  OBSOLETE_FLAG  CUSTOMER_OID            DISPLAY_NAME  \
0      11097              0       2954427  Netscout Systems, Inc.   
1      10455              0       2954427  Netscout Systems, Inc.   
2      10468              0       2972311                IO NODES   
3      10527              0       2972311                IO NODES   
4      10396              0       2988243        Alpha Beta Gamma   
...      ...            ...           ...                     ...   
44344  45506              0       1339006                   kevin   
44345  45590              0       1339006                   kevin   
44346  45828              0       1339006                   kevin   
44347  45829              0       1339006                   kevin   
44348  45830              0       1339006                   kevin   

       SALES_CHANNEL  DISTRIBUTOR_OID  NAME  QUOTE_STATUS_OID  PREP_COMPLETE  \
0                  1                1    28                 1              1   
1                  1                1    28                 1              1   
2                  1               14    15                 1              1   
3                  1               14    15                 1              1   
4                  1               18    22                 1              1   
...              ...              ...   ...               ...            ...   
44344              0                0    39                 0              0   
44345              0                0    39                 0              0   
44346              0                0    39                 0              0   
44347              0                0    39                 0              1   
44348              0                0    39                 0              1   

       IS_FORWARD  ...  CUSTOM_FIELD4                    CUSTOM_FIELD5  \
0               0  ...            NaN  rajeshwara.c.dwantham@intel.com   
1               0  ...            NaN  rajeshwara.c.dwantham@intel.com   
2               0  ...            NaN   theresa.s.colldeweih@intel.com   
3               0  ...            NaN   theresa.s.colldeweih@intel.com   
4               0  ...            NaN               success@modeln.com   
...           ...  ...            ...                              ...   
44344           0  ...            NaN                              NaN   
44345           0  ...            NaN                              NaN   
44346           0  ...            NaN                              NaN   
44347           0  ...            NaN                              NaN   
44348           0  ...            NaN                              NaN   

       CUSTOM_FIELD6  CUSTOM_FIELD7  CUSTOM_FIELD8  CUSTOM_FIELD9  \
0               AM03     Enterprise             No            OEM   
1               AM03     Enterprise             No            OEM   
2                NAR     Enterprise             No            OEM   
3               AM03     Enterprise             No            OEM   
4                NAR     Enterprise             No            OEM   
...              ...            ...            ...            ...   
44344            NaN            NaN            NaN            NaN   
44345            NaN            NaN            NaN            NaN   
44346            NaN            NaN            NaN            NaN   
44347            NaN            NaN            NaN            NaN   
44348            NaN            NaN            NaN            NaN   

      CUSTOM_FIELD10 COMPANY_SIZE COMPANY_TYPE ENTERPRISE_CHECK  
0            90 Days            2           14                2  
1            90 Days            5           14                2  
2            90 Days            2            8                2  
3            90 Days            2           14                2  
4            90 Days            3           14                2  
...              ...          ...          ...              ...  
44344            NaN           10           14                5  
44345            NaN           10           14                5  
44346            NaN           10           14                5  
44347            NaN           10           14                5  
44348            NaN           10           14                5  

[44349 rows x 28 columns]
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/tmp/ipykernel_11689/4227734508.py in <module>
      5 fig, ax = plt.subplots(nrows=len(df_quotes_encoded.columns), ncols=1, figsize=(6, 2 * len(df_quotes_encoded.columns)))
      6 for i, col in enumerate(df_quotes_encoded.columns):
----> 7     ax[i].boxplot(df_quotes_encoded[col])
      8     ax[i].set_title(col)
      9 plt.tight_layout()

~/.local/lib/python3.7/site-packages/matplotlib/__init__.py in inner(ax, data, *args, **kwargs)
   1412     def inner(ax, *args, data=None, **kwargs):
   1413         if data is None:
-> 1414             return func(ax, *map(sanitize_sequence, args), **kwargs)
   1415 
   1416         bound = new_sig.bind(ax, *args, **kwargs)

~/.local/lib/python3.7/site-packages/matplotlib/axes/_axes.py in boxplot(self, x, notch, sym, vert, whis, positions, widths, patch_artist, bootstrap, usermedians, conf_intervals, meanline, showmeans, showcaps, showbox, showfliers, boxprops, labels, flierprops, medianprops, meanprops, capprops, whiskerprops, manage_ticks, autorange, zorder)
   3709 
   3710         bxpstats = cbook.boxplot_stats(x, whis=whis, bootstrap=bootstrap,
-> 3711                                        labels=labels, autorange=autorange)
   3712         if notch is None:
   3713             notch = rcParams['boxplot.notch']

~/.local/lib/python3.7/site-packages/matplotlib/cbook/__init__.py in boxplot_stats(X, whis, bootstrap, labels, autorange)
   1193 
   1194         # arithmetic mean
-> 1195         stats['mean'] = np.mean(x)
   1196 
   1197         # medians and quartiles

<__array_function__ internals> in mean(*args, **kwargs)

~/.local/lib/python3.7/site-packages/numpy/core/fromnumeric.py in mean(a, axis, dtype, out, keepdims, where)
   3439 
   3440     return _methods._mean(a, axis=axis, dtype=dtype,
-> 3441                           out=out, **kwargs)
   3442 
   3443 

~/.local/lib/python3.7/site-packages/numpy/core/_methods.py in _mean(a, axis, dtype, out, keepdims, where)
    189             ret = ret.dtype.type(ret / rcount)
    190     else:
--> 191         ret = ret / rcount
    192 
    193     return ret

TypeError: ufunc 'true_divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

PCA Analysis¶

In [28]:
from sklearn.preprocessing import StandardScaler

#define features based on most correlated df_quote_item values
df_quote_item_encoded = df_quote_item_encoded
## merge dataframes on OID and add better features
features=['OID','PART_OID', 'QUANTITY_REQUESTED', 'DISTI_REBATE_AMT', 'ITEM_NUM', 'REBATE_AMT']

# handle missing values
#df_quote_item_encoded.dropna(subset=['STATUS_ENCODED'], inplace=True)
df_quote_item_encoded.fillna(999, inplace=True) # here will filled in the NA vaulues with 999 
print(df_quote_item_encoded['ACTUAL_STATUS'].str.count('Approved').sum())

x = df_quote_item_encoded.loc[:, features].values # features
y= df_quote_item_encoded.loc[:,['ACTUAL_STATUS']].values # target variable
print(df_quote_item_encoded['ACTUAL_STATUS'].str.count('Approved').sum())

x = StandardScaler().fit_transform(x)
print(df_quote_item_encoded['ACTUAL_STATUS'].str.count('Approved').sum())
print(df_quote_item_encoded['ACTUAL_STATUS'].head(300000))
47110
47110
47110
0         Other
1         Other
2         Other
3         Other
4         Other
          ...  
299995    Other
299996    Other
299997    Other
299998    Other
299999    Other
Name: ACTUAL_STATUS, Length: 300000, dtype: object
In [29]:
# find how many components explain the variance. Recall that each component is made up of a certain amount of attributes. 
from sklearn.decomposition import PCA
pca = PCA().fit(x)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')

# usually select the amount of features that calls for 90+% of variance. 
Out[29]:
Text(0, 0.5, 'cumulative explained variance')

4 Component PCA Analysis¶

In [30]:
pca = PCA(n_components=4) # 4 components because it above 90% but not at 100%. 
principalComponents = pca.fit_transform(x)

df_principal = pd.DataFrame(data=principalComponents, columns=['principal component #1','principal component #2', 'principal component #3','principal component #4'])
df_final_principal = pd.concat([df_principal, df_quote_item_encoded[['ACTUAL_STATUS']]], axis=1)
print("done")
done
In [31]:
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(1,1,1)

ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 Component PCA', fontsize=20)

ACTUAL_STATUS = ['Approved', 'Other']
colors=['red', 'green'] # had to switch these around afterwards, because the colors were not matching up properly. 

#plt.scatter(df_final_principal[:, 0], df_final_principal[:, 1], )

for status, color in zip(ACTUAL_STATUS, colors):
    saved_indices = (df_final_principal['ACTUAL_STATUS'] == status)
    print(saved_indices)
    ax.scatter(df_final_principal.loc[saved_indices, 'principal component #1'], df_final_principal.loc[saved_indices, 'principal component #2'], c=color, s=40)
ax.legend(['Other', 'Approved']) #switch cause incorrect order first time
# other is referring to quotes which had workflow status set as in progress or rejected.  
ax.grid()    



# we are only able to visualize 2 PCA's on the scatterplot. 
0         False
1         False
2         False
3         False
4         False
          ...  
374299    False
374300    False
374301    False
374302    False
374303    False
Name: ACTUAL_STATUS, Length: 374304, dtype: bool
0         True
1         True
2         True
3         True
4         True
          ... 
374299    True
374300    True
374301    True
374302    True
374303    True
Name: ACTUAL_STATUS, Length: 374304, dtype: bool
In [32]:
print(pca.explained_variance_ratio_) # high variance in each PCA component is also a measure of how well it is maintaining a dataset. 
loadings = pca.components_
importance = np.abs(loadings).sum()
print(pd.DataFrame(loadings.T, columns=['principal component #1', 'principal component #2','principal component #3', 'principal component #4' ], index=features))

pc1 = pca.explained_variance_ratio_[0] * 100
pc2 = pca.explained_variance_ratio_[1] * 100 
pc3  =  pca.explained_variance_ratio_[2] * 100 
pc4 =  pca.explained_variance_ratio_[3] * 100 

print(f"\nThe first component explains {pc1}% of variance\nThe second principal component explains {pc2}% of variance\nThe third principal component explains {pc3}% of variance\nThe fourth principal component explains {pc4}% of variance\nTotal Variance Explained:", pc1 + pc2 + pc3 + pc4,"%")

plt.pie([pc1, pc2, pc3, pc4], labels=['principal component #1', 'principal component #2','principal component #3', 'principal component #4'], autopct='%1.1f%%')

# the first four principal components are preserving 87% of the original dataset. 
[0.45954915 0.167024   0.14225908 0.10224955]
                    principal component #1  principal component #2  \
OID                               0.520268               -0.017660   
PART_OID                         -0.155715                0.880522   
QUANTITY_REQUESTED               -0.383694               -0.314262   
DISTI_REBATE_AMT                 -0.498769               -0.223840   
ITEM_NUM                         -0.436197               -0.019600   
REBATE_AMT                        0.344696               -0.274082   

                    principal component #3  principal component #4  
OID                              -0.126738                0.171882  
PART_OID                          0.408014                0.019247  
QUANTITY_REQUESTED                0.401942                0.713070  
DISTI_REBATE_AMT                  0.208233               -0.363746  
ITEM_NUM                         -0.326339               -0.363848  
REBATE_AMT                        0.711370               -0.443757  

The first component explains 45.954914878502215% of variance
The second principal component explains 16.70240022684773% of variance
The third principal component explains 14.225908494149575% of variance
The fourth principal component explains 10.224954653541358% of variance
Total Variance Explained: 87.10817825304088 %
Out[32]:
([<matplotlib.patches.Wedge at 0x7fb79e129b10>,
  <matplotlib.patches.Wedge at 0x7fb79e12c350>,
  <matplotlib.patches.Wedge at 0x7fb79e12cb90>,
  <matplotlib.patches.Wedge at 0x7fb79e12c890>],
 [Text(-0.09512648248825806, 1.095879077421141, 'principal component #1'),
  Text(-0.7854377182107304, -0.7701218025818523, 'principal component #2'),
  Text(0.34623052445573216, -1.0440902374484253, 'principal component #3'),
  Text(1.026049664335851, -0.3965124037357314, 'principal component #4')],
 [Text(-0.05188717226632258, 0.5977522240478951, '52.8%'),
  Text(-0.42842057356948926, -0.4200664377719194, '19.2%'),
  Text(0.18885301333949026, -0.5695037658809592, '16.3%'),
  Text(0.5596634532741005, -0.21627949294676255, '11.7%')])

2 Component Logistic Regression PCA Analysis¶

In [33]:
from sklearn.model_selection import train_test_split
X = df_quote_item_encoded.loc[:, features].values
y= df_quote_item_encoded.loc[:,['STATUS_ENCODED']].values
print(df_quote_item_encoded['STATUS_ENCODED'])

X_train, X_test, y_train, y_test = train_test_split(X, y.ravel(), test_size = 0.25, random_state = 0)

explained_variance = pca.explained_variance_ratio_
sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

pca = PCA(n_components = 2)
X_pca = pca.fit_transform(X)

X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

explained_variance = pca.explained_variance_ratio_

from sklearn.linear_model import LogisticRegression  

classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
0         1
1         1
2         1
3         1
4         1
         ..
374299    1
374300    1
374301    1
374302    1
374303    1
Name: STATUS_ENCODED, Length: 374304, dtype: int64
Out[33]:
LogisticRegression(random_state=0)
In [34]:
# this was tested from a geeksforgeeks tutorial. Same logic was applied here just to see if there would be any differences in the visualizations. 
from sklearn.metrics import confusion_matrix, accuracy_score
from matplotlib.colors import ListedColormap
y_pred = classifier.predict(X_test)



cm = confusion_matrix(y_test, y_pred)


X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1,
                     stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1,
                     stop = X_set[:, 1].max() + 1, step = 0.01))

plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(),
             X2.ravel()]).T).reshape(X1.shape), alpha = 0.75,
             cmap = ListedColormap(('yellow', 'white', 'aquamarine')))

plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = ListedColormap(('red', 'green'))(i), label = j)

plt.title('Logistic Regression (Training set)')
plt.xlabel('PC1') 
plt.ylabel('PC2') 
plt.legend()


plt.show() #1 success, 2 other

print("accuracy: ", accuracy_score(y_test, y_pred))
accuracy:  0.959968367957596
In [35]:
print(pca.explained_variance_ratio_) # the first two principal components are preserving 62% of the original dataset. 
loadings = pca.components_
importance = np.abs(loadings).sum()
print(pd.DataFrame(loadings.T, columns=['principal component #1', 'principal component #2'], index=features))

pc1 = pca.explained_variance_ratio_[0] * 100
pc2 = pca.explained_variance_ratio_[1] * 100 

print(f"\nThe first component explains {pc1}% of variance\nThe second principal component explains {pc2}% of variance", pc1 + pc2,"%")
[0.45949775 0.16701913]
                    principal component #1  principal component #2
OID                               0.520373               -0.016255
PART_OID                         -0.155952                0.878987
QUANTITY_REQUESTED               -0.383733               -0.315560
DISTI_REBATE_AMT                 -0.499068               -0.224565
ITEM_NUM                         -0.436352               -0.017510
REBATE_AMT                        0.343755               -0.277133

The first component explains 45.94977500199063% of variance
The second principal component explains 16.701912895219817% of variance 62.65168789721045 %
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: